commit a978c8a9e6ec36ea5d6d89453f6afbdd5fc2df44
Author: Zack Buhman
Date:   Sat Apr 4 23:50:52 2026 -0500

    initial

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5a58160
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.gdb_history
+*.spv
+*.o
+main
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..dfcc342
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,86 @@
+CC=$(PREFIX)gcc
+CXX=$(PREFIX)g++
+OBJCOPY=$(PREFIX)objcopy
+
+OBJARCH = elf64-x86-64
+
+OPT = -O0 -march=x86-64-v3
+
+DEBUG = -g
+
+CSTD = -std=gnu23
+CXXSTD = -std=gnu++23
+CFLAGS += -Wall -Werror
+CFLAGS += -Wfatal-errors
+CFLAGS += -Wno-error=unused-variable -Wno-error=unused-but-set-variable
+CFLAGS += -Wno-error=unused-function
+CFLAGS += -Wno-error=array-bounds
+CFLAGS += -Wno-unknown-pragmas
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -I./include
+CFLAGS += -I../SDL3-dist/include
+CFLAGS += -I../volk
+CFLAGS += -fpic
+
+FLAGS += -fstack-protector -fstack-protector-all -fno-omit-frame-pointer -fsanitize=address
+
+LDFLAGS += -lm
+ifeq ($(shell uname),Linux)
+LDFLAGS += -Wl,-z,noexecstack
+endif
+
+OBJS = \
+	src/main.o
+
+BINS = \
+	index.idx.o \
+	position_normal_texture.vtx.o
+
+SHADERS = \
+	shader/triangle.vs.spv.o \
+	shader/triangle.ps.spv.o
+
+LIBS = \
+	../SDL3-dist/lib64/libSDL3.a \
+	../volk/volk.o
+
+all: main
+
+define BUILD_BINARY_O
+	$(OBJCOPY) -I binary -O $(OBJARCH) $< $@
+endef
+
+%.vtx.o: %.vtx
+	$(BUILD_BINARY_O)
+
+%.idx.o: %.idx
+	$(BUILD_BINARY_O)
+
+%.spv.o: %.spv
+	$(BUILD_BINARY_O)
+
+%.o: %.c
+	$(CC) $(ARCH) $(CSTD) $(CFLAGS) $(FLAGS) $(OPT) $(DEBUG) -c $< -o $@
+
+%.o: %.cpp
+	$(CXX) $(ARCH) $(CXXSTD) $(CFLAGS) $(FLAGS) $(OPT) $(DEBUG) -c $< -o $@
+
+main: $(OBJS) $(LIBS) $(BINS) $(SHADERS)
+	$(CC) $(ARCH) $(LDFLAGS) $(FLAGS) $(OPT) $(DEBUG) $^ -o $@
+
+%.vs.spv: %.hlsl
+	../dxc/bin/dxc -spirv -T vs_6_1 -E VSMain -fspv-target-env=vulkan1.3 $< -Fo $@
+
+%.ps.spv: %.hlsl
+	../dxc/bin/dxc -spirv -T ps_6_1 -E PSMain -fspv-target-env=vulkan1.3 $< -Fo $@
+
+.SUFFIXES:
+.INTERMEDIATE:
+.SECONDARY:
+.PHONY: all clean phony dist
+
+%: RCS/%,v
+%: RCS/%
+%: %,v
+%: s.%
+%: SCCS/s.%
diff --git a/include/directxmath/directxmath.h b/include/directxmath/directxmath.h
new file mode 100644
index 0000000..2190797
--- /dev/null
+++ b/include/directxmath/directxmath.h
@@ -0,0 +1,2231 @@
+//-------------------------------------------------------------------------------------
+// DirectXMath.h -- SIMD C++ Math library
+//
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+//
+// https://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#ifndef __cplusplus
+#error DirectX Math requires C++
+#endif
+
+#define DIRECTX_MATH_VERSION 320
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+#error DirectX Math requires Visual C++ 2017 or later.
+#endif
+
+#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_)
+#define _XM_VECTORCALL_ 1
+#endif
+
+#if _XM_VECTORCALL_
+#define XM_CALLCONV __vectorcall
+#elif defined(__GNUC__)
+#define XM_CALLCONV
+#else
+#define XM_CALLCONV __fastcall
+#endif
+
+#ifndef XM_DEPRECATED
+#if (__cplusplus >= 201402L)
+#define XM_DEPRECATED [[deprecated]]
+#elif defined(__GNUC__)
+#define XM_DEPRECATED __attribute__ ((deprecated))
+#else
+#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
+#endif
+#endif
+
+#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_AVX2_INTRINSICS_
+#endif
+
+#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_FMA3_INTRINSICS_
+#endif
+
+#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_F16C_INTRINSICS_
+#endif
+
+#if !defined(_XM_F16C_INTRINSICS_) && defined(__F16C__) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_F16C_INTRINSICS_
+#endif
+
+#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
+#define _XM_AVX_INTRINSICS_
+#endif
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
+#define _XM_AVX_INTRINSICS_
+#endif
+
+#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_AVX_INTRINSICS_
+#endif
+
+#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
+#define _XM_SSE4_INTRINSICS_
+#endif
+
+#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_)
+#define _XM_SSE3_INTRINSICS_
+#endif
+
+#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
+#define _XM_SSE_INTRINSICS_
+#endif
+
+#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__ || __powerpc64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
+#define _XM_SSE_INTRINSICS_
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
+#define _XM_ARM_NEON_INTRINSICS_
+#elif !defined(_XM_NO_INTRINSICS_)
+#error DirectX Math does not support this target
+#endif
+#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+#if defined(_XM_SSE_INTRINSICS_) && defined(_MSC_VER) && (_MSC_VER >= 1920) && !defined(__clang__) && !defined(_XM_SVML_INTRINSICS_) && !defined(_XM_DISABLE_INTEL_SVML_)
+#define _XM_SVML_INTRINSICS_
+#endif
+
+#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && (defined(__clang__) || defined(__GNUC__)) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_NO_XMVECTOR_OVERLOADS_
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4514 4820)
+// C4514/4820: Off by default noise
+#endif
+#include <math.h>
+#include <float.h>
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#ifndef _XM_NO_INTRINSICS_
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4987)
+// C4987: Off by default noise
+#endif
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#endif
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#if (defined(__clang__) || defined(__GNUC__)) && (__x86_64__ || __i386__) && !defined(__MINGW32__)
+#include <cpuid.h>
+#endif
+
+#ifdef _XM_SSE_INTRINSICS_
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef _XM_SSE3_INTRINSICS_
+#include <pmmintrin.h>
+#endif
+
+#ifdef _XM_SSE4_INTRINSICS_
+#include <smmintrin.h>
+#endif
+
+#ifdef _XM_AVX_INTRINSICS_
+#include <immintrin.h>
+#endif
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+#endif // !_XM_NO_INTRINSICS_
+
+#include "sal.h"
+#include <assert.h>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4005 4668)
+// C4005/4668: Old header issue
+#endif
+#include <stdint.h>
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#if (__cplusplus >= 201703L)
+#define XM_ALIGNED_DATA(x) alignas(x)
+#define XM_ALIGNED_STRUCT(x) struct alignas(x)
+#elif defined(__GNUC__)
+#define XM_ALIGNED_DATA(x) __attribute__ ((aligned(x)))
+#define XM_ALIGNED_STRUCT(x) struct __attribute__ ((aligned(x)))
+#else
+#define XM_ALIGNED_DATA(x) __declspec(align(x))
+#define XM_ALIGNED_STRUCT(x) __declspec(align(x)) struct
+#endif
+
+/****************************************************************************
+ *
+ * Conditional intrinsics
+ *
+ ****************************************************************************/
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+#if defined(_XM_NO_MOVNT_)
+#define XM_STREAM_PS( p, a ) _mm_store_ps((p), (a))
+#define XM256_STREAM_PS( p, a ) _mm256_store_ps((p), (a))
+#define XM_SFENCE()
+#else
+#define XM_STREAM_PS( p, a ) _mm_stream_ps((p), (a))
+#define XM256_STREAM_PS( p, a ) _mm256_stream_ps((p), (a))
+#define XM_SFENCE() _mm_sfence()
+#endif
+
+#if defined(_XM_FMA3_INTRINSICS_)
+#define XM_FMADD_PS( a, b, c ) _mm_fmadd_ps((a), (b), (c))
+#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c))
+#else
+#define XM_FMADD_PS( a, b, c ) _mm_add_ps(_mm_mul_ps((a), (b)), (c))
+#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
+#endif
+
+#if defined(_XM_AVX_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
+#define XM_PERMUTE_PS( v, c ) _mm_permute_ps((v), c )
+#else
+#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps((v), (v), c )
+#endif
+
+#if (defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 11)) || defined(__powerpc64__)
+#define XM_LOADU_SI16( p ) _mm_cvtsi32_si128(*reinterpret_cast<const unsigned short*>(p))
+#else
+#define XM_LOADU_SI16( p ) _mm_loadu_si16(p)
+#endif
+
+#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+#if defined(__clang__) || defined(__GNUC__)
+#define XM_PREFETCH( a ) __builtin_prefetch(a)
+#elif defined(_MSC_VER)
+#define XM_PREFETCH( a ) __prefetch(a)
+#else
+#define XM_PREFETCH( a )
+#endif
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+ /****************************************************************************
+ *
+ * Constant definitions
+ *
+ ****************************************************************************/
+
+#if defined(__XNAMATH_H__) && defined(XM_PI)
+#undef XM_PI
+#undef XM_2PI
+#undef XM_1DIVPI
+#undef XM_1DIV2PI
+#undef XM_PIDIV2
+#undef XM_PIDIV4
+#undef XM_SELECT_0
+#undef XM_SELECT_1
+#undef XM_PERMUTE_0X
+#undef XM_PERMUTE_0Y
+#undef XM_PERMUTE_0Z
+#undef XM_PERMUTE_0W
+#undef XM_PERMUTE_1X
+#undef XM_PERMUTE_1Y
+#undef XM_PERMUTE_1Z
+#undef XM_PERMUTE_1W
+#undef XM_CRMASK_CR6
+#undef XM_CRMASK_CR6TRUE
+#undef XM_CRMASK_CR6FALSE
+#undef XM_CRMASK_CR6BOUNDS
+#undef XM_CACHE_LINE_SIZE
+#endif
+
+    constexpr float XM_PI = 3.141592654f;
+    constexpr float XM_2PI = 6.283185307f;
+    constexpr float XM_1DIVPI = 0.318309886f;
+    constexpr float XM_1DIV2PI = 0.159154943f;
+    constexpr float XM_PIDIV2 = 1.570796327f;
+    constexpr float XM_PIDIV4 = 0.785398163f;
+
+    constexpr uint32_t XM_SELECT_0 = 0x00000000;
+    constexpr uint32_t XM_SELECT_1 = 0xFFFFFFFF;
+
+    constexpr uint32_t XM_PERMUTE_0X = 0;
+    constexpr uint32_t XM_PERMUTE_0Y = 1;
+    constexpr uint32_t XM_PERMUTE_0Z = 2;
+    constexpr uint32_t XM_PERMUTE_0W = 3;
+    constexpr uint32_t XM_PERMUTE_1X = 4;
+    constexpr uint32_t XM_PERMUTE_1Y = 5;
+    constexpr uint32_t XM_PERMUTE_1Z = 6;
+    constexpr uint32_t XM_PERMUTE_1W = 7;
+
+    constexpr uint32_t XM_SWIZZLE_X = 0;
+    constexpr uint32_t XM_SWIZZLE_Y = 1;
+    constexpr uint32_t XM_SWIZZLE_Z = 2;
+    constexpr uint32_t XM_SWIZZLE_W = 3;
+
+    constexpr uint32_t XM_CRMASK_CR6 = 0x000000F0;
+    constexpr uint32_t XM_CRMASK_CR6TRUE = 0x00000080;
+    constexpr uint32_t XM_CRMASK_CR6FALSE = 0x00000020;
+    constexpr uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE;
+
+#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
+    constexpr size_t XM_CACHE_LINE_SIZE = 128;
+#else
+    constexpr size_t XM_CACHE_LINE_SIZE = 64;
+#endif
+
+
+ /****************************************************************************
+ *
+ * Macros
+ *
+ ****************************************************************************/
+
+#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue)
+#undef XMComparisonAllTrue
+#undef XMComparisonAnyTrue
+#undef XMComparisonAllFalse
+#undef XMComparisonAnyFalse
+#undef XMComparisonMixed
+#undef XMComparisonAllInBounds
+#undef XMComparisonAnyOutOfBounds
+#endif
+
+    // Unit conversion
+
+    constexpr float XMConvertToRadians(float fDegrees) noexcept { return fDegrees * (XM_PI / 180.0f); }
+    constexpr float XMConvertToDegrees(float fRadians) noexcept { return fRadians * (180.0f / XM_PI); }
+
+    // Condition register evaluation proceeding a recording (R) comparison
+
+    constexpr bool XMComparisonAllTrue(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE; }
+    constexpr bool XMComparisonAnyTrue(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE; }
+    constexpr bool XMComparisonAllFalse(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE; }
+    constexpr bool XMComparisonAnyFalse(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE; }
+    constexpr bool XMComparisonMixed(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6) == 0; }
+    constexpr bool XMComparisonAllInBounds(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS; }
+    constexpr bool XMComparisonAnyOutOfBounds(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS; }
+
+
+ /****************************************************************************
+ *
+ * Data types
+ *
+ ****************************************************************************/
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4068 4201 4365 4324 4820)
+    // C4068: ignore unknown pragmas
+    // C4201: nonstandard extension used : nameless struct/union
+    // C4365: Off by default noise
+    // C4324/4820: padding warnings
+#endif
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
+#endif
+
+//------------------------------------------------------------------------------
+#if defined(_XM_NO_INTRINSICS_)
+    struct __vector4
+    {
+        union
+        {
+            float vector4_f32[4];
+            uint32_t vector4_u32[4];
+        };
+    };
+#endif // _XM_NO_INTRINSICS_
+
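[Editor's note: the sketch below is an illustration added for this writeup, not part of the commit. It shows how the unit-conversion helpers and CR6 comparison masks defined above are meant to be used; XMVector4EqualR, FXMVECTOR, and XM_PIDIV2 come from declarations that appear elsewhere in this header, and the #if 0 guard keeps the sample out of any build.]

#if 0 // editorial illustration only; never compiled
// A recording ("R") comparison returns a uint32_t condition mask rather than
// a bool; the constexpr XMComparison* predicates above decode its CR6 bits.
inline bool AllComponentsEqual(FXMVECTOR a, FXMVECTOR b) noexcept
{
    uint32_t cr = XMVector4EqualR(a, b);   // compare all four lanes, record CR6
    return XMComparisonAllTrue(cr);        // true iff every lane compared equal
}

// Degrees-to-radians helper defined above: 90 degrees == XM_PIDIV2.
constexpr float quarterTurn = XMConvertToRadians(90.0f);
#endif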
//------------------------------------------------------------------------------ + // Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte + // boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + using XMVECTOR = __m128; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + using XMVECTOR = float32x4_t; +#else + using XMVECTOR = __vector4; +#endif + + // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR FXMVECTOR; +#else + typedef const XMVECTOR& FXMVECTOR; +#endif + + // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR GXMVECTOR; +#else + typedef const XMVECTOR& GXMVECTOR; +#endif + + // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR HXMVECTOR; +#else + typedef const XMVECTOR& HXMVECTOR; +#endif + + // Fix-up for (7th+) XMVECTOR parameters to pass by reference + typedef const XMVECTOR& CXMVECTOR; + + //------------------------------------------------------------------------------ + // Conversion types for constants + XM_ALIGNED_STRUCT(16) XMVECTORF32 + { + union + { + float f[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } + inline operator const float* () const noexcept { return f; } + #ifdef _XM_NO_INTRINSICS_ + #elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } + #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } + #endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORI32 + { + union + { + int32_t i[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } + #ifdef _XM_NO_INTRINSICS_ + #elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } + #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } + #endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORU8 + { + union + { + uint8_t u[16]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } + #ifdef _XM_NO_INTRINSICS_ + #elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } + #elif defined(_XM_ARM_NEON_INTRINSICS_) && 
(defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } + #endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORU32 + { + union + { + uint32_t u[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } + #ifdef _XM_NO_INTRINSICS_ + #elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } + #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } + #endif + }; + + //------------------------------------------------------------------------------ + // Vector operators + +#ifndef _XM_NO_XMVECTOR_OVERLOADS_ + XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept; + + XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + + XMVECTOR& operator*= (XMVECTOR& V, float S) noexcept; + XMVECTOR& operator/= (XMVECTOR& V, float S) noexcept; + + XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S) noexcept; + XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S) noexcept; +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + + //------------------------------------------------------------------------------ + // Matrix type: Sixteen 32 bit floating point components aligned on a + // 16 byte boundary and mapped to four hardware vector registers + + struct XMMATRIX; + + // Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMMATRIX FXMMATRIX; +#else + typedef const XMMATRIX& FXMMATRIX; +#endif + + // Fix-up for (2nd+) XMMATRIX parameters to pass by reference + typedef const XMMATRIX& CXMMATRIX; + +#ifdef _XM_NO_INTRINSICS_ + struct XMMATRIX + #else + XM_ALIGNED_STRUCT(16) XMMATRIX + #endif + { + #ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + #else + XMVECTOR r[4]; + #endif + + XMMATRIX() = default; + + XMMATRIX(const XMMATRIX&) = default; + + #if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431) + XMMATRIX& operator= (const XMMATRIX& M) noexcept { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } + #else + XMMATRIX& operator=(const XMMATRIX&) = default; + + XMMATRIX(XMMATRIX&&) = default; + XMMATRIX& operator=(XMMATRIX&&) = default; + #endif + + constexpr 
XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) noexcept : r{ R0,R1,R2,R3 } {} + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + explicit XMMATRIX(_In_reads_(16) const float* pArray) noexcept; + + #ifdef _XM_NO_INTRINSICS_ + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + #endif + + XMMATRIX operator+ () const noexcept { return *this; } + XMMATRIX operator- () const noexcept; + + XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M) noexcept; + XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M) noexcept; + XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M) noexcept; + XMMATRIX& operator*= (float S) noexcept; + XMMATRIX& operator/= (float S) noexcept; + + XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const noexcept; + XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const noexcept; + XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const noexcept; + XMMATRIX operator* (float S) const noexcept; + XMMATRIX operator/ (float S) const noexcept; + + friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M) noexcept; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit floating point components + struct XMFLOAT2 + { + float x; + float y; + + XMFLOAT2() = default; + + XMFLOAT2(const XMFLOAT2&) = default; + XMFLOAT2& operator=(const XMFLOAT2&) = default; + + XMFLOAT2(XMFLOAT2&&) = default; + XMFLOAT2& operator=(XMFLOAT2&&) = default; + + constexpr XMFLOAT2(float _x, float _y) noexcept : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + }; + + // 2D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT2A : public XMFLOAT2 + { + using XMFLOAT2::XMFLOAT2; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit signed integer components + struct XMINT2 + { + int32_t x; + int32_t y; + + XMINT2() = default; + + XMINT2(const XMINT2&) = default; + XMINT2& operator=(const XMINT2&) = default; + + XMINT2(XMINT2&&) = default; + XMINT2& operator=(XMINT2&&) = default; + + constexpr XMINT2(int32_t _x, int32_t _y) noexcept : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + }; + + // 2D Vector; 32 bit unsigned integer components + struct XMUINT2 + { + uint32_t x; + uint32_t y; + + XMUINT2() = default; + + XMUINT2(const XMUINT2&) = default; + XMUINT2& operator=(const XMUINT2&) = default; + + XMUINT2(XMUINT2&&) = default; + XMUINT2& operator=(XMUINT2&&) = default; + + constexpr XMUINT2(uint32_t _x, uint32_t _y) noexcept : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit floating point components + struct XMFLOAT3 + { + float x; + float y; + float z; + + XMFLOAT3() = default; + + XMFLOAT3(const XMFLOAT3&) = default; + XMFLOAT3& operator=(const XMFLOAT3&) = default; + + XMFLOAT3(XMFLOAT3&&) = default; + XMFLOAT3& operator=(XMFLOAT3&&) = default; + + constexpr XMFLOAT3(float _x, float _y, float _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float* pArray) 
noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + // 3D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3A : public XMFLOAT3 + { + using XMFLOAT3::XMFLOAT3; + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit signed integer components + struct XMINT3 + { + int32_t x; + int32_t y; + int32_t z; + + XMINT3() = default; + + XMINT3(const XMINT3&) = default; + XMINT3& operator=(const XMINT3&) = default; + + XMINT3(XMINT3&&) = default; + XMINT3& operator=(XMINT3&&) = default; + + constexpr XMINT3(int32_t _x, int32_t _y, int32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + // 3D Vector; 32 bit unsigned integer components + struct XMUINT3 + { + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() = default; + + XMUINT3(const XMUINT3&) = default; + XMUINT3& operator=(const XMUINT3&) = default; + + XMUINT3(XMUINT3&&) = default; + XMUINT3& operator=(XMUINT3&&) = default; + + constexpr XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit floating point components + struct XMFLOAT4 + { + float x; + float y; + float z; + float w; + + XMFLOAT4() = default; + + XMFLOAT4(const XMFLOAT4&) = default; + XMFLOAT4& operator=(const XMFLOAT4&) = default; + + XMFLOAT4(XMFLOAT4&&) = default; + XMFLOAT4& operator=(XMFLOAT4&&) = default; + + constexpr XMFLOAT4(float _x, float _y, float _z, float _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + }; + + // 4D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4A : public XMFLOAT4 + { + using XMFLOAT4::XMFLOAT4; + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit signed integer components + struct XMINT4 + { + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() = default; + + XMINT4(const XMINT4&) = default; + XMINT4& operator=(const XMINT4&) = default; + + XMINT4(XMINT4&&) = default; + XMINT4& operator=(XMINT4&&) = default; + + constexpr XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + }; + + // 4D Vector; 32 bit unsigned integer components + struct XMUINT4 + { + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() = default; + + XMUINT4(const XMUINT4&) = default; + XMUINT4& operator=(const XMUINT4&) = default; + + XMUINT4(XMUINT4&&) = default; + XMUINT4& operator=(XMUINT4&&) = default; + + constexpr XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + }; + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#pragma clang diagnostic ignored "-Wunknown-warning-option" +#pragma 
clang diagnostic ignored "-Wunsafe-buffer-usage" +#endif + + //------------------------------------------------------------------------------ + // 3x3 Matrix: 32 bit floating point components + struct XMFLOAT3X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() = default; + + XMFLOAT3X3(const XMFLOAT3X3&) = default; + XMFLOAT3X3& operator=(const XMFLOAT3X3&) = default; + + XMFLOAT3X3(XMFLOAT3X3&&) = default; + XMFLOAT3X3& operator=(XMFLOAT3X3&&) = default; + + constexpr XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) + {} + explicit XMFLOAT3X3(_In_reads_(9) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + //------------------------------------------------------------------------------ + // 4x3 Row-major Matrix: 32 bit floating point components + struct XMFLOAT4X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + float f[12]; + }; + + XMFLOAT4X3() = default; + + XMFLOAT4X3(const XMFLOAT4X3&) = default; + XMFLOAT4X3& operator=(const XMFLOAT4X3&) = default; + + XMFLOAT4X3(XMFLOAT4X3&&) = default; + XMFLOAT4X3& operator=(XMFLOAT4X3&&) = default; + + constexpr XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + _41(m30), _42(m31), _43(m32) + {} + explicit XMFLOAT4X3(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + // 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X3A : public XMFLOAT4X3 + { + using XMFLOAT4X3::XMFLOAT4X3; + }; + + //------------------------------------------------------------------------------ + // 3x4 Column-major Matrix: 32 bit floating point components + struct XMFLOAT3X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + }; + float m[3][4]; + float f[12]; + }; + + XMFLOAT3X4() = default; + + XMFLOAT3X4(const XMFLOAT3X4&) = default; + XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default; + + XMFLOAT3X4(XMFLOAT3X4&&) = default; + XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default; + + constexpr XMFLOAT3X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23) + {} + explicit XMFLOAT3X4(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + // 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3X4A : public 
XMFLOAT3X4 + { + using XMFLOAT3X4::XMFLOAT3X4; + }; + + //------------------------------------------------------------------------------ + // 4x4 Matrix: 32 bit floating point components + struct XMFLOAT4X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() = default; + + XMFLOAT4X4(const XMFLOAT4X4&) = default; + XMFLOAT4X4& operator=(const XMFLOAT4X4&) = default; + + XMFLOAT4X4(XMFLOAT4X4&&) = default; + XMFLOAT4X4& operator=(XMFLOAT4X4&&) = default; + + constexpr XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) + {} + explicit XMFLOAT4X4(_In_reads_(16) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + // 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X4A : public XMFLOAT4X4 + { + using XMFLOAT4X4::XMFLOAT4X4; + }; + + //////////////////////////////////////////////////////////////////////////////// + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + + XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource) noexcept; + + XMVECTOR XM_CALLCONV 
XMLoadInt3(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource) noexcept; + + XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource) noexcept; + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V) noexcept; + + void 
XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMVectorZero() noexcept; + XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept; + + float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept; + float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float* f, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetXPtr(_Out_ float* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetYPtr(_Out_ float* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetZPtr(_Out_ float* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetWPtr(_Out_ float* w, _In_ FXMVECTOR V) noexcept; + + uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t* w, _In_ FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV 
XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float* f, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float* w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t* w) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + + XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept; + XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept; + XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef XMVectorRotateRight +#undef XMVectorInsert +#endif + + XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept; + + XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + XMVECTOR XM_CALLCONV 
XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds) noexcept; + + XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max) noexcept; + XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + + XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept; + 
XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept; + XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G) noexcept; + + /**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, 
FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t 
InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV 
XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + + /**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) 
noexcept; + bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (VectorCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept; + + _Success_(return) + bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR* 
outScale, _Out_ XMVECTOR* outRotQuat, _Out_ XMVECTOR* outTrans, _In_ FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept; + XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept; + + // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll) + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + + // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z) + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept; + XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV 
XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + + + /**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + + bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T) noexcept; + void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept; + + // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll) + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + + // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z) + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept; + + void XM_CALLCONV 
XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q) noexcept; + + /**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + + bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept; + bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2) noexcept; + void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2) noexcept; + + // Transforms a plane given an inverse transpose matrix + XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX ITM) noexcept; + + // Transforms an array of planes given an inverse transpose matrix + XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (PlaneCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (PlaneCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX ITM) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3) noexcept; + + /**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + + bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept; + bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept; + + XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C) noexcept; + XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2) noexcept; + XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation) noexcept; + XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV 
XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept;
+    XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept;
+
+
+    /****************************************************************************
+     *
+     * Miscellaneous operations
+     *
+     ****************************************************************************/
+
+    bool XMVerifyCPUSupport() noexcept;
+
+    XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex) noexcept;
+
+    bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept;
+    float XMScalarModAngle(float Value) noexcept;
+
+    float XMScalarSin(float Value) noexcept;
+    float XMScalarSinEst(float Value) noexcept;
+
+    float XMScalarCos(float Value) noexcept;
+    float XMScalarCosEst(float Value) noexcept;
+
+    void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept;
+    void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept;
+
+    float XMScalarASin(float Value) noexcept;
+    float XMScalarASinEst(float Value) noexcept;
+
+    float XMScalarACos(float Value) noexcept;
+    float XMScalarACosEst(float Value) noexcept;
+
+    /****************************************************************************
+     *
+     * Templates
+     *
+     ****************************************************************************/
+
+#if defined(__XNAMATH_H__) && defined(XMMin)
+#undef XMMin
+#undef XMMax
+#endif
+
+    template<class T> inline T XMMin(T a, T b) noexcept { return (a < b) ? a : b; }
+    template<class T> inline T XMMax(T a, T b) noexcept { return (a > b) ? a : b; }
+
+    //------------------------------------------------------------------------------
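+
+    // Usage sketch (illustrative; not part of the upstream DirectXMath sources):
+    // the compile-time XMVectorPermute below selects each output lane from V1
+    // (indices 0-3) or V2 (indices 4-7). For example:
+    //
+    //     XMVECTOR a = XMVectorSet(0.f, 1.f, 2.f, 3.f);
+    //     XMVECTOR b = XMVectorSet(4.f, 5.f, 6.f, 7.f);
+    //     XMVECTOR r = XMVectorPermute<0, 1, 4, 5>(a, b);   // r = (0, 1, 4, 5)
+    //
+    // On SSE targets the PermuteHelper machinery resolves such calls at compile
+    // time to a single shuffle (or, for this index set, to the _mm_movelh_ps
+    // specialization further below).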
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+// PermuteHelper internal template (SSE only)
+    namespace MathInternal
+    {
+        // Slow path fallback for permutes that do not map to a single SSE shuffle opcode.
+        template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept
+            {
+                static const XMVECTORU32 selectMask =
+                { { {
+                        WhichX ? 0xFFFFFFFF : 0,
+                        WhichY ? 0xFFFFFFFF : 0,
+                        WhichZ ? 0xFFFFFFFF : 0,
+                        WhichW ? 0xFFFFFFFF : 0,
+                } } };
+
+                XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
+                XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
+
+                XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+                XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+                return _mm_or_ps(masked1, masked2);
+            }
+        };
+
+        // Fast path for permutes that only read from the first vector.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) noexcept { return XM_PERMUTE_PS(v1, Shuffle); }
+        };
+
+        // Fast path for permutes that only read from the second vector.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) noexcept { return XM_PERMUTE_PS(v2, Shuffle); }
+        };
+
+        // Fast path for permutes that read XY from the first vector, ZW from the second.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v1, v2, Shuffle); }
+        };
+
+        // Fast path for permutes that read XY from the second vector, ZW from the first.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v2, v1, Shuffle); }
+        };
+    }
+
+#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+    // General permute template
+    template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
+    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) noexcept
+    {
+        static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+        static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+        static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+        static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+        constexpr uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+        constexpr bool WhichX = PermuteX > 3;
+        constexpr bool WhichY = PermuteY > 3;
+        constexpr bool WhichZ = PermuteZ > 3;
+        constexpr bool WhichW = PermuteW > 3;
+
+        return MathInternal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+#else
+
+        return XMVectorPermute(V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW);
+
+#endif
+    }
+
+    // Special-case permute templates
+    template<> constexpr XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR) noexcept { return V1; }
+    template<> constexpr XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) noexcept { return V2; }
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movelh_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6, 7, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movehl_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpacklo_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpackhi_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); }
+#endif
+
+#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x1); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x3); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x4); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x5); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x6); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return
_mm_blend_ps(V1, V2, 0x7); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x8); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x9); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xA); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xB); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xC); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xD); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + // If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead + // The mirror cases are not spelled out here as the programmer can always swap the arguments + // (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), 
vrev64_f32(vget_high_f32(V2))); }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_low_f32(V2)); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_low_f32(V2))); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_low_f32(V2))); }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 2, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[0]; }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 5, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[0]; }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 2, 4, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[0]; }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 3, 5, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 2, 3, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 1); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 4, 5, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 3); }
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+    //------------------------------------------------------------------------------
+
+    // General swizzle template
+    template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
+    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) noexcept
+    {
+        static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
+        static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
+        static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
+        static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+        return XM_PERMUTE_PS(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
+#else
+
+        return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
+
+#endif
+    }
+
+    // Specialized swizzles
+    template<> constexpr XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) noexcept { return V; }
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { return _mm_movelh_ps(V, V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { return _mm_movehl_ps(V, V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return _mm_unpacklo_ps(V, V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return _mm_unpackhi_ps(V, V); }
+#endif
+
+#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept {
return _mm_moveldup_ps(V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return _mm_movehdup_ps(V); } +#endif + +#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return _mm_broadcastss_ps(V); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 0); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 1, 1>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 2, 2>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 0); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 3, 3, 3>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 1); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 3, 2>(FXMVECTOR V) noexcept { return vrev64q_f32(V); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { float32x2_t vt = vget_low_f32(V); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { float32x2_t vt = vget_high_f32(V); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 1, 0>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_low_f32(V)); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 3, 2>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_high_f32(V)); return vcombine_f32(vt, vt); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 3, 2>(FXMVECTOR V) noexcept { return vcombine_f32(vget_low_f32(V), vrev64_f32(vget_high_f32(V))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 2, 3>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V)), vget_high_f32(V)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vget_high_f32(V), vrev64_f32(vget_low_f32(V))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 0, 1>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vget_low_f32(V)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vrev64_f32(vget_low_f32(V))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 2, 0, 2>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 3, 1, 3>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 2, 3, 0>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 1); } + template<> inline XMVECTOR XM_CALLCONV 
XMVectorSwizzle<2, 3, 0, 1>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 0, 1, 2>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 3); }
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+    //------------------------------------------------------------------------------
+
+    template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) noexcept
+    {
+        static_assert(Elements < 4, "Elements template parameter out of range");
+        return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
+    }
+
+    template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) noexcept
+    {
+        static_assert(Elements < 4, "Elements template parameter out of range");
+        return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
+    }
+
+    template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) noexcept
+    {
+        static_assert(Elements < 4, "Elements template parameter out of range");
+        return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
+    }
+
+    template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3>
+    inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) noexcept
+    {
+        XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1);
+        return XMVectorSelect(VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control);
+    }
+
+    /****************************************************************************
+     *
+     * Globals
+     *
+     ****************************************************************************/
+
+    // The purpose of the following global constants is to prevent redundant
+    // reloading of the constants when they are referenced by more than one
+    // separate inline math routine called within the same function. Declaring
+    // a constant locally within a routine is sufficient to prevent redundant
+    // reloads of that constant when that single routine is called multiple
+    // times in a function, but if the constant is used (and declared) in a
+    // separate math routine it would be reloaded.
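+
+    // Illustrative sketch (not part of the upstream header): two inlined
+    // routines that share constants benefit from these globals. For example,
+    // XMVectorSin and XMVectorCos both wrap their argument with the shared
+    // angle-wrapping constants g_XMTwoPi and g_XMReciprocalTwoPi, so in
+    //
+    //     XMVECTOR s = XMVectorSin(v);
+    //     XMVECTOR c = XMVectorCos(v);
+    //
+    // the compiler can keep a single register copy of those constants live
+    // across both inlined calls instead of rematerializing them per routine.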
+ +#ifndef XMGLOBALCONST +#if defined(__GNUC__) && !defined(__MINGW32__) +#define XMGLOBALCONST extern const __attribute__((weak)) +#else +#define XMGLOBALCONST extern const __declspec(selectany) +#endif +#endif + + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = { { { -0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = { { { -2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = { { { -0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = { { { -2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = { { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = { { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = { { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = { { { +1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = { { { +0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = { { { -0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = { { { -0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = { { { +0.999866f, +0.999866f, +0.999866f, +0.999866f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = { { { -0.3302995f, +0.180141f, -0.085133f, +0.0208351f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = { { { 2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = { { { +1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f } } }; + XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = { { { XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = { { { 1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = { { { 0.0f, 1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = { { { 0.0f, 0.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = { { { 0.0f, 0.0f, 0.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = { { { -1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = { { { 0.0f, -1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = { { { 0.0f, 0.0f, -1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = { { { 0.0f, 0.0f, 0.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = { { { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegate3 = { { { 0x80000000, 0x80000000, 0x80000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskXY = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMask3 = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX = { { { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskY = { { { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskZ = { { { 0x00000000, 
0x00000000, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskW = { { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne = { { { 1.0f, 1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne3 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMZero = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwo = { { { 2.f, 2.f, 2.f, 2.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMFour = { { { 4.f, 4.f, 4.f, 4.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSix = { { { 6.f, 6.f, 6.f, 6.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = { { { -1.0f, -1.0f, -1.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { { { 0.5f, 0.5f, 0.5f, 0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = { { { -0.5f, -0.5f, -0.5f, -0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = { { { -XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativePi = { { { -XM_PI, -XM_PI, -XM_PI, -XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMHalfPi = { { { XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2 } } }; + XMGLOBALCONST XMVECTORF32 g_XMPi = { { { XM_PI, XM_PI, XM_PI, XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = { { { XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwoPi = { { { XM_2PI, XM_2PI, XM_2PI, XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = { { { XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMEpsilon = { { { 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f } } }; + XMGLOBALCONST XMVECTORI32 g_XMInfinity = { { { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaN = { { { 0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = { { { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMAbsMask = { { { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMin = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMax = { { { 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = { { { 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = { { { 0x00000000, 0x00000000, 0x00000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = { { { 0.0f, 0.0f, 0.0f, float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = { { { 1.0f / (255.0f * float(0x10000)), 1.0f / (255.0f * float(0x100)), 1.0f / 255.0f, 1.0f / (255.0f * float(0x1000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = { { { 0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = { { { 0x00000200, 0x00080000, 0x20000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = { { { -512.0f, -512.0f * float(0x400), -512.0f * float(0x100000), float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = { { { 1.0f / 511.0f, 1.0f / (511.0f * float(0x400)), 1.0f / (511.0f * float(0x100000)), 1.0f / (3.0f * float(0x40000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = { { { 0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = { { { 0x00008000, 0x00000000, 0x00000000, 
0x00000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = { { { -32768.0f, 0.0f, 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = { { { 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = { { { 0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = { { { 0x00008000, 0x00008000, 0x00000000, 0x00000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = { { { -32768.0f, -32768.0f, 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = { { { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 1.0f / (32767.0f * 65536.0f) } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNoFraction = { { { 8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f } } };
+    XMGLOBALCONST XMVECTORI32 g_XMMaskByte = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateX = { { { -1.0f, 1.0f, 1.0f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateY = { { { 1.0f, -1.0f, 1.0f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { { { 1.0f, 1.0f, -1.0f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateW = { { { 1.0f, 1.0f, 1.0f, -1.0f } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { { { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = { { { 1.0f, 1.0f / 65536.0f, 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = { { { 1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipY = { { { 0, 0x80000000, 0, 0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipZ = { { { 0, 0, 0x80000000, 0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipW = { { { 0, 0, 0, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = { { { 0, 0x80000000, 0x80000000, 0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipZW = { { { 0, 0, 0x80000000, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipYW = { { { 0, 0x80000000, 0, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast<int>(0xC0000000) } } };
+    XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = { { { 0x200, 0x200 << 10, 0x200 << 20, 0 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = { { { 0, 0, 0, 32768.0f * 65536.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 0 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = { { { 1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } };
+    XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = { { { 0xFF, 0xFF00, 0xFF0000, 0xFF000000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = { { { 0x80, 0x8000, 0x800000, 0x00000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = { { { -128.0f, -128.0f * 256.0f, -128.0f * 65536.0f, 0 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = { { { 32768.0f *
65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMMaxInt = { { { 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = { { { 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { { { 12.92f, 12.92f, 12.92f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { { { 0.055f, 0.055f, 0.055f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { { { 1.055f, 1.055f, 1.055f, 1.0f } } }; + XMGLOBALCONST XMVECTORI32 g_XMExponentBias = { { { 127, 127, 127, 127 } } }; + XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = { { { -126, -126, -126, -126 } } }; + XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = { { { 23, 23, 23, 23 } } }; + XMGLOBALCONST XMVECTORI32 g_XMMinNormal = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = { { { 0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = { { { 0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMBin128 = { { { 0x43000000, 0x43000000, 0x43000000, 0x43000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = { { { 0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000 } } }; + XMGLOBALCONST XMVECTORI32 g_XM253 = { { { 253, 253, 253, 253 } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = { { { -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = { { { +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = { { { -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = { { { +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = { { { -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = { { { +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = { { { -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = { { { +1.442693f, +1.442693f, +1.442693f, +1.442693f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = { { { -0.721242f, -0.721242f, -0.721242f, -0.721242f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = { { { +0.479384f, +0.479384f, +0.479384f, +0.479384f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = { { { -0.350295f, -0.350295f, -0.350295f, -0.350295f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = { { { +0.248590f, +0.248590f, +0.248590f, +0.248590f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = { { { -0.145700f, -0.145700f, -0.145700f, -0.145700f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = { { { +0.057148f, +0.057148f, +0.057148f, +0.057148f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = { { { -0.010578f, -0.010578f, -0.010578f, -0.010578f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLgE = { { { +1.442695f, +1.442695f, +1.442695f, +1.442695f } } }; + XMGLOBALCONST XMVECTORF32 g_XMInvLgE = { { { +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f } } }; + XMGLOBALCONST 
XMVECTORF32 g_XMLg10 = { { { +3.321928f, +3.321928f, +3.321928f, +3.321928f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMInvLg10 = { { { +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f } } };
+    XMGLOBALCONST XMVECTORF32 g_UByteMax = { { { 255.0f, 255.0f, 255.0f, 255.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ByteMin = { { { -127.0f, -127.0f, -127.0f, -127.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ByteMax = { { { 127.0f, 127.0f, 127.0f, 127.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ShortMin = { { { -32767.0f, -32767.0f, -32767.0f, -32767.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ShortMax = { { { 32767.0f, 32767.0f, 32767.0f, 32767.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f, 65535.0f, 65535.0f } } };
+
+    /****************************************************************************
+     *
+     * Implementation
+     *
+     ****************************************************************************/
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101)
+    // C4068/4616: ignore unknown pragmas
+    // C4214/4204: nonstandard extension used
+    // C4365/4640: Off by default noise
+    // C6001/6101: False positives
+#endif
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
+#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
+#endif
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wfloat-equal"
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#pragma clang diagnostic ignored "-Wunknown-warning-option"
+#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
+#endif
+
+//------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept
+    {
+#if defined(_XM_NO_INTRINSICS_)
+        XMVECTORU32 vResult;
+        vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000;
+        vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000;
+        vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000;
+        vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000;
+        return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        XMVECTORU32 vResult;
+        vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000;
+        vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000;
+        vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000;
+        vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000;
+        return vResult.v;
+#else // XM_SSE_INTRINSICS_
+        static const XMVECTORU32 g_vMask1 = { { { 1, 1, 1, 1 } } };
+        // Move the parms to a vector
+        __m128i vTemp = _mm_set_epi32(static_cast<int>(C3), static_cast<int>(C2), static_cast<int>(C1), static_cast<int>(C0));
+        // Mask off the low bits
+        vTemp = _mm_and_si128(vTemp, g_vMask1);
+        // 0xFFFFFFFF on true bits
+        vTemp = _mm_cmpeq_epi32(vTemp, g_vMask1);
+        // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
+        vTemp = _mm_and_si128(vTemp, g_XMOne);
+        return _mm_castsi128_ps(vTemp);
+#endif
+    }
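+
+    // Usage sketch (illustrative; not part of the upstream DirectXMath sources):
+    // each of the four flags selects 1.0f (bit set) or 0.0f (bit clear) per
+    // lane, built entirely from integer operations:
+    //
+    //     XMVECTOR m = XMVectorSetBinaryConstant(1, 0, 1, 0);   // (1, 0, 1, 0)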
+
+    //------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+        assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+
+        using DirectX::XMConvertVectorIntToFloat;
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return XMConvertVectorIntToFloat(V.v, DivExponent);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        // Splat the int
+        int32x4_t vScale = vdupq_n_s32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = vcvtq_f32_s32(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (It's really a float)
+        vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
+        // Multiply by the reciprocal (Perform a right shift by DivExponent)
+        vResult = vmulq_f32(vResult, reinterpret_cast<const XMVECTOR*>(&vScale)[0]);
+        return vResult;
+#else // XM_SSE_INTRINSICS_
+        // Splat the int
+        __m128i vScale = _mm_set1_epi32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (It's really a float)
+        vScale = _mm_set1_epi32(static_cast<int>(uScale));
+        // Multiply by the reciprocal (Perform a right shift by DivExponent)
+        vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+        return vResult;
+#endif
+    }
+
+    //------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+#if defined(_XM_NO_INTRINSICS_)
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return V.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        int32x4_t V = vdupq_n_s32(IntConstant);
+        return reinterpret_cast<const XMVECTOR*>(&V)[0];
+#else // XM_SSE_INTRINSICS_
+        __m128i V = _mm_set1_epi32(IntConstant);
+        return _mm_castsi128_ps(V);
+#endif
+    }
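+
+    // Usage sketch (illustrative; not part of the upstream DirectXMath sources):
+    // DivExponent divides the splatted integer by 2^DivExponent. The trick is
+    // that subtracting (DivExponent << 23) from the bit pattern of 1.0f
+    // (0x3F800000) produces the float 2^-DivExponent, which is then used as a
+    // multiplier:
+    //
+    //     XMVECTOR q = XMVectorSplatConstant(3, 2);    // (0.75, 0.75, 0.75, 0.75)
+    //     XMVECTOR i = XMVectorSplatConstantInt(-5);   // four lanes of int -5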
+//
+// https://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4701)
+// C4701: false positives
+#endif
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t  DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do
+    {
+        auto iTemp = static_cast<int32_t>(VInt.vector4_u32[ElementIndex]);
+        Result.vector4_f32[ElementIndex] = static_cast<float>(iTemp) * fScale;
+    }
+    while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do
+    {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= -(65536.0f * 32768.0f))
+        {
+            iResult = (-0x7FFFFFFF) - 1;
+        }
+        else if (fTemp > (65536.0f * 32768.0f) - 128.0f)
+        {
+            iResult = 0x7FFFFFFF;
+        }
+        else
+        {
+            iResult = static_cast<int32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = static_cast<uint32_t>(iResult);
+    }
+    while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
+    // In case of positive overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxInt);
+    // Float to int conversion
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
+    vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    return vOverflow;
+#endif
+}
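+
+// Note on the bounds above: 65536.0f*32768.0f is exactly 2^31, and
+// 2^31 - 128 (2147483520.0f) is the largest float strictly below 2^31, so
+// the clamps saturate to INT32_MIN/INT32_MAX instead of hitting the
+// undefined float-to-int overflow; (-0x7FFFFFFF) - 1 spells INT32_MIN
+// without writing an out-of-range int literal.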
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t  DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do
+    {
+        Result.vector4_f32[ElementIndex] = static_cast<float>(VUInt.vector4_u32[ElementIndex]) * fScale;
+    }
+    while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt, g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt, vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    iMask = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(iMask));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do
+    {
+        uint32_t uResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= 0.0f)
+        {
+            uResult = 0;
+        }
+        else if (fTemp >= (65536.0f * 65536.0f))
+        {
+            uResult = 0xFFFFFFFFU;
+        }
+        else
+        {
+            uResult = static_cast<uint32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = uResult;
+    }
+    while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
+    // In case of overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxUInt);
+    // Float to int conversion
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    // If there was overflow, set to 0xFFFFFFFFU
+    vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // Clamp to >=0
+    vResult = _mm_max_ps(vResult, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    return vResult;
+#endif
+}
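+
+// Note: _mm_cvttps_epi32 only produces signed 32-bit results, so the SSE
+// path above splits the unsigned range at 2^31 (g_XMUnsignedFix): lanes at
+// or above 2^31 are rebased downward before the convert, then have their
+// sign bit XORed back in, while lanes beyond UINT32_MAX saturate to
+// 0xFFFFFFFF through the vOverflow mask.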
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+/****************************************************************************
+ *
+ * Vector and matrix load operations
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = *pSource;
+    V.vector4_u32[1] = 0;
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t zero = vdupq_n_u32(0);
+    return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss(reinterpret_cast<const float*>(pSource));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = *pSource;
+    V.vector4_f32[1] = 0.f;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t zero = vdupq_n_f32(0);
+    return vld1q_lane_f32(pSource, zero, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss(pSource);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32(pSource);
+    uint32x2_t zero = vdup_n_u32(0);
+    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
+    uint32x2_t x = vld1_u32_ex(pSource, 64);
+#else
+    uint32x2_t x = vld1_u32(pSource);
+#endif
+    uint32x2_t zero = vdup_n_u32(0);
+    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+#endif
+}
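+
+// Note: the 'A'-suffixed variants here are the 16-byte-aligned forms -- they
+// assert the low four address bits are clear and may use aligned
+// instructions (_mm_load_ps, or the _ex alignment-hinted NEON loads on MSVC
+// ARM), while the unsuffixed forms accept unaligned data.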
+ +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2(const XMFLOAT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(x, zero); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2A(const XMFLOAT2A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x2_t x = vld1_f32_ex(reinterpret_cast(pSource), 64); +#else + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); +#endif + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(x, zero); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt2(const XMINT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32(reinterpret_cast(pSource)); + float32x2_t v = vcvt_f32_s32(x); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(v, zero); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt2(const XMUINT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(reinterpret_cast(pSource)); + float32x2_t v = vcvt_f32_u32(x); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(v, zero); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
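+    // (The fixup, for reference: _mm_cvtepi32_ps is a signed conversion, so
+    // inputs with the top bit set would come out negative. The mask against
+    // g_XMNegativeZero (0x80000000) finds those lanes, the XOR below clears
+    // their sign bit so the signed convert is exact, and g_XMFixUnsigned
+    // adds 2^31 back to just those lanes afterwards.)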
+ XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V, vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(pSource); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0); + return vreinterpretq_f32_u32(vcombine_u32(x, y)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra integer which is zero'd +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t V = vld1q_u32_ex(pSource, 128); +#else + uint32x4_t V = vld1q_u32(pSource); +#endif + return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3(const XMFLOAT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t y = vld1_lane_f32(reinterpret_cast(pSource) + 2, zero, 0); + return vcombine_f32(x, y); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(&pSource->z); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = 
_mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(&pSource->z); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra float which is zero'd +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t V = vld1q_f32_ex(reinterpret_cast(pSource), 128); +#else + float32x4_t V = vld1q_f32(reinterpret_cast(pSource)); +#endif + return vsetq_lane_f32(0, V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra float which is zero'd + __m128 V = _mm_load_ps(&pSource->x); + return _mm_and_ps(V, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt3(const XMINT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32(reinterpret_cast(pSource)); + int32x2_t zero = vdup_n_s32(0); + int32x2_t y = vld1_lane_s32(reinterpret_cast(pSource) + 2, zero, 0); + int32x4_t v = vcombine_s32(x, y); + return vcvtq_f32_s32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(&pSource->z)); + __m128 V = _mm_movelh_ps(xy, z); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt3(const XMUINT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(reinterpret_cast(pSource)); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32(reinterpret_cast(pSource) + 2, zero, 0); + uint32x4_t v = vcombine_u32(x, y); + return vcvtq_f32_u32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(&pSource->z)); + __m128 V = _mm_movelh_ps(xy, z); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V, vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept +{ + assert(pSource); + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_u32(pSource)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vld1q_u32_ex(pSource, 128); +#else + return vreinterpretq_f32_u32(vld1q_u32(pSource)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_load_si128(reinterpret_cast(pSource)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4(const XMFLOAT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32(reinterpret_cast(pSource)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_loadu_ps(&pSource->x); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4A(const XMFLOAT4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vld1q_f32_ex(reinterpret_cast(pSource), 128); +#else + return vld1q_f32(reinterpret_cast(pSource)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps(&pSource->x); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt4(const XMINT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = 
static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vld1q_s32(reinterpret_cast(pSource)); + return vcvtq_f32_s32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + return _mm_cvtepi32_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt4(const XMUINT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vld1q_u32(reinterpret_cast(pSource)); + return vcvtq_f32_u32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V), g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V), vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x2_t v2 = vcreate_f32(static_cast(*reinterpret_cast(&pSource->m[2][2]))); + float32x4_t T = vextq_f32(v0, v1, 3); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3)); + M.r[2] = vcombine_f32(vget_high_f32(v1), v2); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps(&pSource->m[0][0]); + __m128 V2 = _mm_loadu_ps(&pSource->m[1][1]); + __m128 V3 = _mm_load_ss(&pSource->m[2][2]); + + __m128 T1 = _mm_unpackhi_ps(V1, Z); + __m128 T2 = _mm_unpacklo_ps(V2, Z); + __m128 T3 = _mm_shuffle_ps(V3, T2, _MM_SHUFFLE(0, 1, 0, 0)); + __m128 T4 = _mm_movehl_ps(T2, T3); + 
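+    // (Lane contents at this point, for reference, writing mRC for row R /
+    // column C of the source: V1 = {m00,m01,m02,m10}, V2 = {m11,m12,m20,m21},
+    // V3 = {m22,0,0,0}; T1 = {m02,0,m10,0}, T2 = {m11,0,m12,0},
+    // T3 = {m22,m22,0,m11}, T4 = {0,m11,m12,0} -- row 1 minus its x
+    // component, which T5 below supplies.)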
__m128 T5 = _mm_movehl_ps(Z, T1); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps(V1, T1); + M.r[1] = _mm_add_ps(T4, T5); + M.r[2] = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 3, 2)); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x4_t v2 = vld1q_f32(&pSource->m[2][2]); + + float32x4_t T1 = vextq_f32(v0, v1, 3); + float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2)); + float32x4_t T3 = vextq_f32(v2, v2, 1); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1, g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2, g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3, g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + 
M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t v0 = vld1q_f32_ex(&pSource->m[0][0], 128); + float32x4_t v1 = vld1q_f32_ex(&pSource->m[1][1], 128); + float32x4_t v2 = vld1q_f32_ex(&pSource->m[2][2], 128); +#else + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x4_t v2 = vld1q_f32(&pSource->m[2][2]); +#endif + + float32x4_t T1 = vextq_f32(v0, v1, 3); + float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2)); + float32x4_t T3 = vextq_f32(v2, v2, 1); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1, g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2, g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3, g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = 
vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128); + float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128); +#else + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); +#endif + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), 
g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4(const XMFLOAT4X4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32(reinterpret_cast(&pSource->_11)); + M.r[1] = vld1q_f32(reinterpret_cast(&pSource->_21)); + M.r[2] = vld1q_f32(reinterpret_cast(&pSource->_31)); + M.r[3] = vld1q_f32(reinterpret_cast(&pSource->_41)); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = _mm_loadu_ps(&pSource->_41); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A(const XMFLOAT4X4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = 
pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + M.r[0] = vld1q_f32_ex(reinterpret_cast(&pSource->_11), 128); + M.r[1] = vld1q_f32_ex(reinterpret_cast(&pSource->_21), 128); + M.r[2] = vld1q_f32_ex(reinterpret_cast(&pSource->_31), 128); + M.r[3] = vld1q_f32_ex(reinterpret_cast(&pSource->_41), 128); +#else + M.r[0] = vld1q_f32(reinterpret_cast(&pSource->_11)); + M.r[1] = vld1q_f32(reinterpret_cast(&pSource->_21)); + M.r[2] = vld1q_f32(reinterpret_cast(&pSource->_31)); + M.r[3] = vld1q_f32(reinterpret_cast(&pSource->_41)); +#endif + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = _mm_load_ps(&pSource->_41); + return M; +#endif +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX(V); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(pDestination, *reinterpret_cast(&V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(reinterpret_cast(pDestination), V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX(V); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(pDestination, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(pDestination, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); + vst1_u32(pDestination, VL); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_u32_ex(pDestination, VL, 64); +#else + vst1_u32(pDestination, VL); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V 
+) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32(reinterpret_cast(pDestination), VL); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_f32_ex(reinterpret_cast(pDestination), VL, 64); +#else + vst1_f32(reinterpret_cast(pDestination), VL); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + int32x2_t iv = vcvt_s32_f32(v); + vst1_s32(reinterpret_cast(pDestination), iv); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + // Write two ints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vOverflow)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + uint32x2_t iv = vcvt_u32_f32(v); + vst1_u32(reinterpret_cast(pDestination), iv); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
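+    // (Same rebase as XMConvertVectorFloatToUInt: g_XMUnsignedFix holds 2^31,
+    // lanes at or above it are shifted down into signed range before
+    // _mm_cvttps_epi32 and get their sign bit XORed back in below.)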
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    // Write two uints
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt3
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
+    vst1_u32(pDestination, VL);
+    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination[2]), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt3A
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
+    vst1_u32_ex(pDestination, VL, 64);
+#else
+    vst1_u32(pDestination, VL);
+#endif
+    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = _mm_movehl_ps(V, V);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination[2]), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3
+(
+    XMFLOAT3* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
+    vst1q_lane_f32(reinterpret_cast<float*>(pDestination) + 2, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *reinterpret_cast<int*>(&pDestination->x) = _mm_extract_ps(V, 0);
+    *reinterpret_cast<int*>(&pDestination->y) = _mm_extract_ps(V, 1);
+    *reinterpret_cast<int*>(&pDestination->z) = _mm_extract_ps(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(&pDestination->z, z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline
void XM_CALLCONV XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_f32_ex(reinterpret_cast(pDestination), VL, 64); +#else + vst1_f32(reinterpret_cast(pDestination), VL); +#endif + vst1q_lane_f32(reinterpret_cast(pDestination) + 2, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + *reinterpret_cast(&pDestination->z) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = _mm_movehl_ps(V, V); + _mm_store_ss(&pDestination->z, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + int32x2_t vL = vget_low_s32(v); + vst1_s32(reinterpret_cast(pDestination), vL); + vst1q_lane_s32(reinterpret_cast(pDestination) + 2, v, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + // Write 3 uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vOverflow)); + __m128 z = XM_PERMUTE_PS(vOverflow, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination->z), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + uint32x2_t vL = vget_low_u32(v); + vst1_u32(reinterpret_cast(pDestination), vL); + vst1q_lane_u32(reinterpret_cast(pDestination) + 2, v, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    // Write 3 uints
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
+    __m128 z = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->z), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt4
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+    pDestination[3] = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt4A
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+    pDestination[3] = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
+    vst1q_u32_ex(pDestination, V, 128);
+#else
+    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4
+(
+    XMFLOAT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+    pDestination->w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_f32(reinterpret_cast<float*>(pDestination), V);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_storeu_ps(&pDestination->x, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4A
+(
+    XMFLOAT4A* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+    pDestination->w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
+    vst1q_f32_ex(reinterpret_cast<float*>(pDestination), V, 128);
+#else
+    vst1q_f32(reinterpret_cast<float*>(pDestination), V);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ps(&pDestination->x, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreSInt4
+(
+    XMINT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
+    pDestination->z = static_cast<int32_t>(V.vector4_f32[2]);
+    pDestination->w = static_cast<int32_t>(V.vector4_f32[3]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x4_t v = vcvtq_s32_f32(V);
+    vst1q_s32(reinterpret_cast<int32_t*>(pDestination), v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(V);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUInt4
+(
+    XMUINT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
+    pDestination->z = static_cast<uint32_t>(V.vector4_f32[2]);
+    pDestination->w = static_cast<uint32_t>(V.vector4_f32[3]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t v = vcvtq_u32_f32(V);
+    vst1q_u32(reinterpret_cast<uint32_t*>(pDestination), v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to >=0
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
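+    // Same signed-range fixup as XMStoreUInt3 above: bias by 2^31, truncate,
+    // then patch the sign bit back in.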
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3x3
+(
+    XMFLOAT3X3* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
+    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
+    vst1q_f32(&pDestination->m[0][0], T2);
+
+    T1 = vextq_f32(M.r[1], M.r[1], 1);
+    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
+    vst1q_f32(&pDestination->m[1][1], T2);
+
+    vst1q_lane_f32(&pDestination->m[2][2], M.r[2], 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = M.r[0];
+    XMVECTOR vTemp2 = M.r[1];
+    XMVECTOR vTemp3 = M.r[2];
+    XMVECTOR vWork = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 0, 2, 2));
+    vTemp1 = _mm_shuffle_ps(vTemp1, vWork, _MM_SHUFFLE(2, 0, 1, 0));
+    _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
+    _mm_storeu_ps(&pDestination->m[1][1], vTemp2);
+    vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(&pDestination->m[2][2], vTemp3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4x3
+(
+    XMFLOAT4X3* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+    pDestination->m[3][0] = M.r[3].vector4_f32[0];
+    pDestination->m[3][1] = M.r[3].vector4_f32[1];
+    pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
+    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
+    vst1q_f32(&pDestination->m[0][0], T2);
+
+    T1 = vextq_f32(M.r[1], M.r[1], 1);
+    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
+    vst1q_f32(&pDestination->m[1][1], T2);
+
+    T1 = 
vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32(&pDestination->m[2][2], T2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(2, 2, 0, 0)); + vTemp1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 2, 1, 0)); + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2)); + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0)); + _mm_storeu_ps(&pDestination->m[0][0], vTemp1); + _mm_storeu_ps(&pDestination->m[1][1], vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2], vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32_ex(&pDestination->m[0][0], T2, 128); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32_ex(&pDestination->m[1][1], T2, 128); + + T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32_ex(&pDestination->m[2][2], T2, 128); +#else + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32(&pDestination->m[2][2], T2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(1, 0, 2, 2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1, vTemp, _MM_SHUFFLE(2, 0, 1, 0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0], vTemp1); + _mm_store_ps(&pDestination->m[1][1], vTemp2); + _mm_store_ps(&pDestination->m[2][2], vTemp3); +#endif +} + 
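+// The 4x3 stores above pack twelve floats into three 16-byte vector writes;
+// the second and third writes start at m[1][1] and m[2][2] because each
+// 128-bit store straddles a row boundary. A sketch of a typical call, with
+// `bone` as a hypothetical caller-side variable:
+//
+//     XMFLOAT4X3A bone;                        // 16-byte aligned by type
+//     XMStoreFloat4x3A(&bone, XMMatrixRotationY(1.0f));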
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4 +( + XMFLOAT3X4* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_storeu_ps(&pDestination->m[0][0], r0); + _mm_storeu_ps(&pDestination->m[1][0], r1); + _mm_storeu_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4A +( + XMFLOAT3X4A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128); + vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128); + vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128); +#else + 
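+    // Plain NEON stores: without the _ex hint, the alignment promise comes
+    // from the XMFLOAT3X4A type itself.
+    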
vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_store_ps(&pDestination->m[0][0], r0); + _mm_store_ps(&pDestination->m[1][0], r1); + _mm_store_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32(reinterpret_cast(&pDestination->_11), M.r[0]); + vst1q_f32(reinterpret_cast(&pDestination->_21), M.r[1]); + vst1q_f32(reinterpret_cast(&pDestination->_31), M.r[2]); + vst1q_f32(reinterpret_cast(&pDestination->_41), M.r[3]); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps(&pDestination->_11, M.r[0]); + _mm_storeu_ps(&pDestination->_21, M.r[1]); + _mm_storeu_ps(&pDestination->_31, M.r[2]); + _mm_storeu_ps(&pDestination->_41, M.r[3]); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + 
pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(reinterpret_cast(&pDestination->_11), M.r[0], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_21), M.r[1], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_31), M.r[2], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_41), M.r[3], 128); +#else + vst1q_f32(reinterpret_cast(&pDestination->_11), M.r[0]); + vst1q_f32(reinterpret_cast(&pDestination->_21), M.r[1]); + vst1q_f32(reinterpret_cast(&pDestination->_31), M.r[2]); + vst1q_f32(reinterpret_cast(&pDestination->_41), M.r[3]); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps(&pDestination->_11, M.r[0]); + _mm_store_ps(&pDestination->_21, M.r[1]); + _mm_store_ps(&pDestination->_31, M.r[2]); + _mm_store_ps(&pDestination->_41, M.r[3]); +#endif +} + diff --git a/include/directxmath/directxmathmatrix.inl b/include/directxmath/directxmathmatrix.inl new file mode 100644 index 0000000..5e96a33 --- /dev/null +++ b/include/directxmath/directxmathmatrix.inl @@ -0,0 +1,3554 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// https://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +// Return true if any entry in the matrix is NaN +inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do + { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest < 0x007FFFFFU) + { + break; // NaN found + } + ++pWork; // Next entry + } + while (--i); + return (i != 0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + float32x4_t vX = M.r[0]; + float32x4_t vY = M.r[1]; + float32x4_t vZ = M.r[2]; + float32x4_t vW = M.r[3]; + // Test themselves to check for NaN + uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX)); + uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY)); + uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ)); + uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + xmask = vorrq_u32(xmask, zmask); + ymask = vorrq_u32(ymask, wmask); + xmask = vorrq_u32(xmask, ymask); + // If any tested true, return true + uint8x8x2_t vTemp = vzip_u8( + vget_low_u8(vreinterpretq_u8_u32(xmask)), + vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = 
vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX, vX); + vY = _mm_cmpneq_ps(vY, vY); + vZ = _mm_cmpneq_ps(vZ, vZ); + vW = _mm_cmpneq_ps(vW, vW); + // Or all the results + vX = _mm_or_ps(vX, vZ); + vY = _mm_or_ps(vY, vW); + vX = _mm_or_ps(vX, vY); + // If any tested true, return true + return (_mm_movemask_ps(vX) != 0); +#else +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do + { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest == 0x7F800000U) + { + break; // INF found + } + ++pWork; // Next entry + } + while (--i); + return (i != 0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + float32x4_t vX = M.r[0]; + float32x4_t vY = M.r[1]; + float32x4_t vZ = M.r[2]; + float32x4_t vW = M.r[3]; + // Mask off the sign bits + vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask)); + vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask)); + vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask)); + vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask)); + // Compare to infinity + uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity); + uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity); + uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity); + uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity); + // Or the answers together + xmask = vorrq_u32(xmask, zmask); + ymask = vorrq_u32(ymask, wmask); + xmask = vorrq_u32(xmask, ymask); + // If any tested true, return true + uint8x8x2_t vTemp = vzip_u8( + vget_low_u8(vreinterpretq_u8_u32(xmask)), + vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0], g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1], g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2], g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3], g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1, g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2, g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3, g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4, g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1, vTemp2); + vTemp3 = _mm_or_ps(vTemp3, vTemp4); + vTemp1 = _mm_or_ps(vTemp1, vTemp3); + // If any are infinity, the signs are true. 
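+    // (_mm_movemask_ps gathers one sign bit per lane, so the mask is nonzero
+    // exactly when some lane compared equal to +/-infinity.)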
+ return (_mm_movemask_ps(vTemp1) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + auto pWork = reinterpret_cast(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0] ^ 0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5] ^ 0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10] ^ 0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15] ^ 0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne == 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0); + uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1); + uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2); + uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3); + xmask = vandq_u32(xmask, zmask); + ymask = vandq_u32(ymask, wmask); + xmask = vandq_u32(xmask, ymask); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1], g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2], g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3], g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + vTemp3 = _mm_and_ps(vTemp3, vTemp4); + vTemp1 = _mm_and_ps(vTemp1, vTemp3); + return (_mm_movemask_ps(vTemp1) == 0x0f); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; + float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[0][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[0][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[0][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[1][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + 
(M2.m[3][1] * w); + mResult.m[1][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[1][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[2][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[2][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[2][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[3][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[3][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[3][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[1] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32(vZ, vW); + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M1.r[0]); + t0 = _mm256_insertf128_ps(t0, M1.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M1.r[2]); + t1 = _mm256_insertf128_ps(t1, M1.r[3], 1); + + __m256 u0 = _mm256_castps128_ps256(M2.r[0]); + u0 = _mm256_insertf128_ps(u0, M2.r[1], 1); + __m256 u1 = _mm256_castps128_ps256(M2.r[2]); + u1 = _mm256_insertf128_ps(u1, M2.r[3], 1); + + __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); + __m256 c0 = _mm256_mul_ps(a0, b0); + __m256 c1 = _mm256_mul_ps(a1, b0); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); + b0 = _mm256_permute2f128_ps(u0, u0, 0x11); + __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); + __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); + __m256 c4 = _mm256_mul_ps(a0, b1); + __m256 c5 = 
_mm256_mul_ps(a1, b1);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
+    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
+    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
+    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
+
+    t0 = _mm256_add_ps(c2, c6);
+    t1 = _mm256_add_ps(c3, c7);
+
+    XMMATRIX mResult;
+    mResult.r[0] = _mm256_castps256_ps128(t0);
+    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
+    mResult.r[2] = _mm256_castps256_ps128(t1);
+    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
+    return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX mResult;
+    // Splat the component X,Y,Z then W
+#if defined(_XM_AVX_INTRINSICS_)
+    XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
+    XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
+    XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
+    XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
+#else
+    // Use vW to hold the original row
+    XMVECTOR vW = M1.r[0];
+    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    // Perform the operation on the first row
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    // Perform a binary add to reduce cumulative errors
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    mResult.r[0] = vX;
+    // Repeat for the other 3 rows
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
+#else
+    vW = M1.r[1];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    mResult.r[1] = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
+#else
+    vW = M1.r[2];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    mResult.r[2] = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
+#else
+    vW = M1.r[3];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 
3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[0][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[0][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[0][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[1][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[1][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[1][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[2][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[2][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[2][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[3][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[3][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[3][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = 
vmulq_lane_f32(M2.r[0], VL, 0);
+    vY = vmulq_lane_f32(M2.r[1], VL, 1);
+    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
+    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
+    float32x4_t r3 = vaddq_f32(vZ, vW);
+
+    // Transpose result
+    float32x4x2_t P0 = vzipq_f32(r0, r2);
+    float32x4x2_t P1 = vzipq_f32(r1, r3);
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
+
+    XMMATRIX mResult;
+    mResult.r[0] = T0.val[0];
+    mResult.r[1] = T0.val[1];
+    mResult.r[2] = T1.val[0];
+    mResult.r[3] = T1.val[1];
+    return mResult;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    __m256 t0 = _mm256_castps128_ps256(M1.r[0]);
+    t0 = _mm256_insertf128_ps(t0, M1.r[1], 1);
+    __m256 t1 = _mm256_castps128_ps256(M1.r[2]);
+    t1 = _mm256_insertf128_ps(t1, M1.r[3], 1);
+
+    __m256 u0 = _mm256_castps128_ps256(M2.r[0]);
+    u0 = _mm256_insertf128_ps(u0, M2.r[1], 1);
+    __m256 u1 = _mm256_castps128_ps256(M2.r[2]);
+    u1 = _mm256_insertf128_ps(u1, M2.r[3], 1);
+
+    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
+    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
+    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
+    __m256 c0 = _mm256_mul_ps(a0, b0);
+    __m256 c1 = _mm256_mul_ps(a1, b0);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
+    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
+    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
+    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
+    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
+    __m256 c4 = _mm256_mul_ps(a0, b1);
+    __m256 c5 = _mm256_mul_ps(a1, b1);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
+    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
+    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
+    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
+
+    t0 = _mm256_add_ps(c2, c6);
+    t1 = _mm256_add_ps(c3, c7);
+
+    // Transpose result
+    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
+    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
+    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
+    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
+    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
+    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
+    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
+    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
+
+    XMMATRIX mResult;
+    mResult.r[0] = _mm256_castps256_ps128(t0);
+    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
+    mResult.r[2] = _mm256_castps256_ps128(t1);
+    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
+    return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the component X,Y,Z then W
+#if defined(_XM_AVX_INTRINSICS_)
+    XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
+    XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
+    XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
+    XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
+#else
+    // Use vW to hold the original row
+    XMVECTOR vW = M1.r[0];
+    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    // Perform the operation on the first row
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    // Perform a binary add to reduce cumulative errors
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r0 = vX;
+    // Repeat for the other 3 rows
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
+#else
+    vW = M1.r[1];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r1 = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
+#else
+    vW = M1.r[2];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r2 = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
+#else
+    vW = M1.r[3];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r3 = vX;
+
+    // Transpose result
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2));
+
+    XMMATRIX mResult;
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    // Original matrix:
+    //
+    //     m00m01m02m03
+    //     m10m11m12m13
+    //     m20m21m22m23
+    //     m30m31m32m33
+
+    XMMATRIX P;
+    P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21
+    
P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M.r[0]); + t0 = _mm256_insertf128_ps(t0, M.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M.r[2]); + t1 = _mm256_insertf128_ps(t1, M.r[3], 1); + + __m256 vTemp = _mm256_unpacklo_ps(t0, t1); + __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); + __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); + vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); + t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMMatrixInverse +( + XMVECTOR* pDeterminant, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] 
= XMVectorPermute(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != nullptr) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Transpose matrix + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX MT; + MT.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + MT.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + MT.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + MT.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 1, 0, 0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 1, 0, 0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(3, 2, 3, 2)); 
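+    // V00..V12 pair up 2x2 sub-blocks of the transposed matrix; the D0..D2
+    // products just below are the partial determinants that feed the
+    // cofactor expansion.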
+ XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(2, 0, 2, 0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR D0 = _mm_mul_ps(V00, V10); + XMVECTOR D1 = _mm_mul_ps(V01, V11); + XMVECTOR D2 = _mm_mul_ps(V02, V12); + + V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(3, 2, 3, 2)); + V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 1, 0, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(3, 2, 3, 2)); + V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 1, 0, 0)); + V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(3, 1, 3, 1)); + V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(2, 0, 2, 0)); + + D0 = XM_FNMADD_PS(V00, V10, D0); + D1 = XM_FNMADD_PS(V01, V11, D1); + D2 = XM_FNMADD_PS(V02, V12, D2); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 0, 2, 1)); + V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0, 1, 0, 2)); + V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 0, 2, 1)); + V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(0, 1, 0, 2)); + V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1)); + + XMVECTOR C0 = _mm_mul_ps(V00, V10); + XMVECTOR C2 = _mm_mul_ps(V01, V11); + XMVECTOR C4 = _mm_mul_ps(V02, V12); + XMVECTOR C6 = _mm_mul_ps(V03, V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2, 1, 3, 2)); + V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 3, 2, 3)); + V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2, 1, 3, 2)); + V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 3, 2, 3)); + V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2)); + + C0 = XM_FNMADD_PS(V00, V10, C0); + C2 = XM_FNMADD_PS(V01, V11, C2); + C4 = XM_FNMADD_PS(V02, V12, C4); + C6 = XM_FNMADD_PS(V03, V13, C6); + + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(0, 3, 0, 3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2)); + V10 = XM_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(2, 0, 3, 1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0)); + V11 = XM_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(0, 3, 0, 3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2)); + V12 = XM_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(2, 0, 3, 1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0)); + V13 = XM_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3)); + + V00 = _mm_mul_ps(V00, V10); + V01 = _mm_mul_ps(V01, V11); + V02 = _mm_mul_ps(V02, V12); + V03 = _mm_mul_ps(V03, V13); + XMVECTOR C1 = _mm_sub_ps(C0, V00); + C0 = _mm_add_ps(C0, V00); + XMVECTOR C3 = _mm_add_ps(C2, V01); + C2 = _mm_sub_ps(C2, V01); + XMVECTOR C5 = _mm_sub_ps(C4, V02); + C4 = _mm_add_ps(C4, V02); + XMVECTOR C7 = _mm_add_ps(C6, V03); + C6 = _mm_sub_ps(C6, V03); + + C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = _mm_shuffle_ps(C2, 
C3, _MM_SHUFFLE(3, 1, 2, 0)); + C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0)); + C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0)); + C0 = XM_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = XM_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0)); + C4 = XM_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0)); + C6 = XM_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0)); + // Get the determinant + XMVECTOR vTemp = XMVector4Dot(C0, MT.r[0]); + if (pDeterminant != nullptr) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne, vTemp); + XMMATRIX mResult; + mResult.r[0] = _mm_mul_ps(C0, vTemp); + mResult.r[1] = _mm_mul_ps(C2, vTemp); + mResult.r[2] = _mm_mul_ps(C4, vTemp); + mResult.r[3] = _mm_mul_ps(C6, vTemp); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMMATRIX mResult; + mResult.r[0] = XMVectorMultiply(XMVectorSwizzle<0, 0, 0, 0>(V1), V2); + mResult.r[1] = XMVectorMultiply(XMVectorSwizzle<1, 1, 1, 1>(V1), V2); + mResult.r[2] = XMVectorMultiply(XMVectorSwizzle<2, 2, 2, 2>(V1), V2); + mResult.r[3] = XMVectorMultiply(XMVectorSwizzle<3, 3, 3, 3>(V1), V2); + return mResult; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept +{ + static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + + XMVECTOR V0 = XMVectorSwizzle(M.r[2]); + XMVECTOR V1 = XMVectorSwizzle(M.r[3]); + XMVECTOR V2 = XMVectorSwizzle(M.r[2]); + XMVECTOR V3 = XMVectorSwizzle(M.r[3]); + XMVECTOR V4 = XMVectorSwizzle(M.r[2]); + XMVECTOR V5 = XMVectorSwizzle(M.r[3]); + + XMVECTOR P0 = XMVectorMultiply(V0, V1); + XMVECTOR P1 = XMVectorMultiply(V2, V3); + XMVECTOR P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorSwizzle(M.r[2]); + V1 = XMVectorSwizzle(M.r[3]); + V2 = XMVectorSwizzle(M.r[2]); + V3 = XMVectorSwizzle(M.r[3]); + V4 = XMVectorSwizzle(M.r[2]); + V5 = XMVectorSwizzle(M.r[3]); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorSwizzle(M.r[1]); + V1 = XMVectorSwizzle(M.r[1]); + V2 = XMVectorSwizzle(M.r[1]); + + XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); + XMVECTOR R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + return XMVector4Dot(S, R); +} + +#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM3_DECOMP_EPSILON 0.0001f + +_Use_decl_annotations_ +inline bool XM_CALLCONV XMMatrixDecompose +( + XMVECTOR* outScale, + XMVECTOR* outRotQuat, + XMVECTOR* outTrans, + FXMMATRIX M +) noexcept +{ + static const XMVECTOR* pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + assert(outScale != nullptr); + assert(outRotQuat != nullptr); + assert(outTrans != nullptr); + + // Get the translation + outTrans[0] = M.r[3]; + + XMVECTOR* ppvBasis[3]; + XMMATRIX matTemp; + 
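+    // Work on a local copy of the basis vectors so the input matrix M is
+    // left untouched while the scales are factored out.
+    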
ppvBasis[0] = &matTemp.r[0];
+    ppvBasis[1] = &matTemp.r[1];
+    ppvBasis[2] = &matTemp.r[2];
+
+    matTemp.r[0] = M.r[0];
+    matTemp.r[1] = M.r[1];
+    matTemp.r[2] = M.r[2];
+    matTemp.r[3] = g_XMIdentityR3.v;
+
+    auto pfScales = reinterpret_cast<float*>(outScale);
+
+    size_t a, b, c;
+    XMVectorGetXPtr(&pfScales[0], XMVector3Length(ppvBasis[0][0]));
+    XMVectorGetXPtr(&pfScales[1], XMVector3Length(ppvBasis[1][0]));
+    XMVectorGetXPtr(&pfScales[2], XMVector3Length(ppvBasis[2][0]));
+    pfScales[3] = 0.f;
+
+    XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2])
+
+    if (pfScales[a] < XM3_DECOMP_EPSILON)
+    {
+        ppvBasis[a][0] = pvCanonicalBasis[a][0];
+    }
+    ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]);
+
+    if (pfScales[b] < XM3_DECOMP_EPSILON)
+    {
+        size_t aa, bb, cc;
+        float fAbsX, fAbsY, fAbsZ;
+
+        fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0]));
+        fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0]));
+        fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0]));
+
+        XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ)
+
+        ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0], pvCanonicalBasis[cc][0]);
+    }
+
+    ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]);
+
+    if (pfScales[c] < XM3_DECOMP_EPSILON)
+    {
+        ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0], ppvBasis[b][0]);
+    }
+
+    ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]);
+
+    float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp));
+
+    // use Cramer's rule to check for handedness of coordinate system
+    if (fDet < 0.0f)
+    {
+        // switch coordinate system by negating the scale and inverting the basis vector on the x-axis
+        pfScales[a] = -pfScales[a];
+        ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]);
+
+        fDet = -fDet;
+    }
+
+    fDet -= 1.0f;
+    fDet *= fDet;
+
+    if (XM3_DECOMP_EPSILON < fDet)
+    {
+        // Non-SRT matrix encountered
+        return false;
+    }
+
+    // generate the quaternion from the matrix
+    outRotQuat[0] = XMQuaternionRotationMatrix(matTemp);
+    return true;
+}
+
+#undef XM3_DECOMP_EPSILON
+#undef XM3RANKDECOMPOSE
+
+//------------------------------------------------------------------------------
+// Transformation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept
+{
+    XMMATRIX M;
+    M.r[0] = g_XMIdentityR0.v;
+    M.r[1] = g_XMIdentityR1.v;
+    M.r[2] = g_XMIdentityR2.v;
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixSet
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+) noexcept
+{
+    XMMATRIX M;
+#if defined(_XM_NO_INTRINSICS_)
+    M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03;
+    M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13;
+    M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23;
+    M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33;
+#else
+    M.r[0] = XMVectorSet(m00, m01, m02, m03);
+    M.r[1] = XMVectorSet(m10, m11, m12, m13);
+    M.r[2] = XMVectorSet(m20, m21, m22, m23);
+    M.r[3] = XMVectorSet(m30, m31, m32, m33);
+#endif
+    return M;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixTranslation
+(
+    float OffsetX,
+    float OffsetY,
+    float OffsetZ
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
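+    /* Illustrative note (not part of the upstream header): DirectXMath
+       matrices are row-major and transform row vectors (v' = v * M), so the
+       translation sits in row 3 rather than in a fourth column. A sketch of
+       the effect:
+
+           XMMATRIX T = XMMatrixTranslation(1.0f, 2.0f, 3.0f);
+           XMVECTOR p = XMVector3TransformCoord(XMVectorZero(), T);
+           // p is now (1, 2, 3)
+    */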
XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect(g_XMIdentityR3.v, Offset, g_XMSelect1110.v); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ScaleX, Zero, 0); + M.r[1] = vsetq_lane_f32(ScaleY, Zero, 1); + M.r[2] = vsetq_lane_f32(ScaleZ, Zero, 2); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps(0, 0, 0, ScaleX); + M.r[1] = _mm_set_ps(0, 0, ScaleY, 0); + M.r[2] = _mm_set_ps(0, ScaleZ, 0, 0); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ)); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif 
defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale, g_XMMaskX); + M.r[1] = _mm_and_ps(Scale, g_XMMaskY); + M.r[2] = _mm_and_ps(Scale, g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1); + T1 = vsetq_lane_f32(fSinAngle, T1, 2); + + float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(-fSinAngle, T0, 2); + + float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2)); + // x = cos,y = 0,z 
= -sin, w = 0 + vSin = _mm_mul_ps(vSin, g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(fSinAngle, T0, 1); + + float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0); + T1 = vsetq_lane_f32(fCosAngle, T1, 1); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = T1; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = cos,y = sin,z = 0, w = 0 + vCos = _mm_unpacklo_ps(vCos, vSin); + XMMATRIX M; + M.r[0] = vCos; + // x = sin,y = cos,z = 0, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1)); + // x = cos,y = -sin,z = 0, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateX); + M.r[1] = vCos; + M.r[2] = g_XMIdentityR2; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float cp = cosf(Pitch); + float sp = sinf(Pitch); + + float cy = cosf(Yaw); + float sy = sinf(Yaw); + + float cr = cosf(Roll); + float sr = sinf(Roll); + + XMMATRIX M; + M.m[0][0] = cr * cy + sr * sp * sy; + M.m[0][1] = sr * cp; + M.m[0][2] = sr * sp * cy - cr * sy; + M.m[0][3] = 0.0f; + + M.m[1][0] = cr * sp * sy - sr * cy; + M.m[1][1] = cr * cp; + M.m[1][2] = sr * sy + cr * sp * cy; + M.m[1][3] = 0.0f; + + M.m[2][0] = cp * sy; + M.m[2][1] = -sp; + M.m[2][2] = cp * cy; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#else + XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + return XMMatrixRotationRollPitchYawFromVector(Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector +( + FXMVECTOR Angles // +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float cp = cosf(Angles.vector4_f32[0]); + float sp = sinf(Angles.vector4_f32[0]); + + float cy = cosf(Angles.vector4_f32[1]); + float sy = sinf(Angles.vector4_f32[1]); + + float cr = cosf(Angles.vector4_f32[2]); + float sr = sinf(Angles.vector4_f32[2]); + + XMMATRIX M; + M.m[0][0] = cr * cy + sr * sp * sy; + M.m[0][1] = sr * cp; + M.m[0][2] = sr * sp * cy - cr * sy; + M.m[0][3] = 0.0f; + + M.m[1][0] = cr * sp * sy - sr * cy; + M.m[1][1] = cr * cp; + M.m[1][2] = sr * sy + cr * sp * cy; + M.m[1][3] = 0.0f; + + M.m[2][0] = cp * sy; + 
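+    /* Illustrative note (not part of the upstream header): Angles.x is pitch
+       (about x), Angles.y is yaw (about y), Angles.z is roll (about z), and
+       the documented order is roll first, then pitch, then yaw. Under that
+       assumption the scalar expansion above should match this composition
+       (angle names hypothetical):
+
+           XMMATRIX R = XMMatrixRotationZ(roll)
+                      * XMMatrixRotationX(pitch)
+                      * XMMatrixRotationY(yaw);
+    */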
M.m[2][1] = -sp; + M.m[2][2] = cp * cy; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#else + static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } }; + + XMVECTOR SinAngles, CosAngles; + XMVectorSinCos(&SinAngles, &CosAngles, Angles); + + XMVECTOR P0 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y0 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P1 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y1 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P2 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P3 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y2 = XMVectorSplatX(SinAngles); + XMVECTOR NS = XMVectorNegate(SinAngles); + + XMVECTOR Q0 = XMVectorMultiply(P0, Y0); + XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v); + Q1 = XMVectorMultiply(Q1, Y1); + XMVECTOR Q2 = XMVectorMultiply(P2, Y2); + Q2 = XMVectorMultiplyAdd(Q2, P3, Q1); + + XMVECTOR V0 = XMVectorPermute(Q0, Q2); + XMVECTOR V1 = XMVectorPermute(Q0, Q2); + XMVECTOR V2 = XMVectorPermute(Q0, NS); + + XMMATRIX M; + M.r[0] = XMVectorSelect(g_XMZero, V0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(g_XMZero, V1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(g_XMZero, V2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal +( + FXMVECTOR NormalAxis, + float Angle +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); + + XMVECTOR C2 = XMVectorSplatZ(A); + XMVECTOR C1 = XMVectorSplatY(A); + XMVECTOR C0 = XMVectorSplatX(A); + + XMVECTOR N0 = XMVectorSwizzle(NormalAxis); + XMVECTOR N1 = XMVectorSwizzle(NormalAxis); + + XMVECTOR V0 = XMVectorMultiply(C2, N0); + V0 = XMVectorMultiply(V0, N1); + + XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis); + R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); + + XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); + XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); + + V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); + XMVECTOR V1 = XMVectorPermute(R1, R2); + XMVECTOR V2 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(V0, V1); + M.r[1] = XMVectorPermute(V0, V1); + M.r[2] = XMVectorPermute(V0, V2); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle); + XMVECTOR C1 = _mm_set_ps1(fCosAngle); + XMVECTOR C0 = _mm_set_ps1(fSinAngle); + + XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1)); + XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2)); + + XMVECTOR V0 = _mm_mul_ps(C2, N0); + V0 = _mm_mul_ps(V0, N1); + + XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis); + R0 = _mm_mul_ps(R0, NormalAxis); + R0 = _mm_add_ps(R0, C1); + + XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis); + R1 = _mm_add_ps(R1, V0); + XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis); + R2 = _mm_sub_ps(V0, R2); + + V0 = _mm_and_ps(R0, g_XMMask3); + XMVECTOR V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 1, 2, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 3, 2, 1)); + XMVECTOR V2 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(0, 0, 1, 1)); + V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 2, 0)); + + R2 = _mm_shuffle_ps(V0, V1, 
_MM_SHUFFLE(1, 0, 3, 0)); + R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = R2; + + R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(3, 2, 3, 1)); + R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = R2; + + V2 = _mm_shuffle_ps(V2, V0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = V2; + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis +( + FXMVECTOR Axis, + float Angle +) noexcept +{ + assert(!XMVector3Equal(Axis, XMVectorZero())); + assert(!XMVector3IsInfinite(Axis)); + + XMVECTOR Normal = XMVector3Normalize(Axis); + return XMMatrixRotationNormal(Normal, Angle); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float qx = Quaternion.vector4_f32[0]; + float qxx = qx * qx; + + float qy = Quaternion.vector4_f32[1]; + float qyy = qy * qy; + + float qz = Quaternion.vector4_f32[2]; + float qzz = qz * qz; + + float qw = Quaternion.vector4_f32[3]; + + XMMATRIX M; + M.m[0][0] = 1.f - 2.f * qyy - 2.f * qzz; + M.m[0][1] = 2.f * qx * qy + 2.f * qz * qw; + M.m[0][2] = 2.f * qx * qz - 2.f * qy * qw; + M.m[0][3] = 0.f; + + M.m[1][0] = 2.f * qx * qy - 2.f * qz * qw; + M.m[1][1] = 1.f - 2.f * qxx - 2.f * qzz; + M.m[1][2] = 2.f * qy * qz + 2.f * qx * qw; + M.m[1][3] = 0.f; + + M.m[2][0] = 2.f * qx * qz + 2.f * qy * qw; + M.m[2][1] = 2.f * qy * qz - 2.f * qx * qw; + M.m[2][2] = 1.f - 2.f * qxx - 2.f * qyy; + M.m[2][3] = 0.f; + + M.m[3][0] = 0.f; + M.m[3][1] = 0.f; + M.m[3][2] = 0.f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + + XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion); + XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0); + + XMVECTOR V0 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR V1 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR R0 = XMVectorSubtract(Constant1110, V0); + R0 = XMVectorSubtract(R0, V1); + + V0 = XMVectorSwizzle(Quaternion); + V1 = XMVectorSwizzle(Q0); + V0 = XMVectorMultiply(V0, V1); + + V1 = XMVectorSplatW(Quaternion); + XMVECTOR V2 = XMVectorSwizzle(Q0); + V1 = XMVectorMultiply(V1, V2); + + XMVECTOR R1 = XMVectorAdd(V0, V1); + XMVECTOR R2 = XMVectorSubtract(V0, V1); + + V0 = XMVectorPermute(R1, R2); + V1 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(R0, V0); + M.r[1] = XMVectorPermute(R0, V0); + M.r[2] = XMVectorPermute(R0, V1); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + + XMVECTOR Q0 = _mm_add_ps(Quaternion, Quaternion); + XMVECTOR Q1 = _mm_mul_ps(Quaternion, Q0); + + XMVECTOR V0 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 0, 0, 1)); + V0 = _mm_and_ps(V0, g_XMMask3); + XMVECTOR V1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 1, 2, 2)); + V1 = _mm_and_ps(V1, g_XMMask3); + XMVECTOR R0 = _mm_sub_ps(Constant1110, V0); + R0 = _mm_sub_ps(R0, V1); + + V0 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 1, 0, 0)); + V1 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 2, 1, 2)); + V0 = _mm_mul_ps(V0, V1); + + V1 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 3, 3, 3)); + XMVECTOR V2 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 0, 2, 1)); + V1 = _mm_mul_ps(V1, V2); + + XMVECTOR R1 = _mm_add_ps(V0, V1); + XMVECTOR R2 = _mm_sub_ps(V0, V1); + + V0 = _mm_shuffle_ps(R1, R2, 
_MM_SHUFFLE(1, 0, 2, 1)); + V0 = XM_PERMUTE_PS(V0, _MM_SHUFFLE(1, 3, 2, 0)); + V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 2, 0, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 0, 2, 0)); + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(1, 0, 3, 0)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = Q1; + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(3, 2, 3, 1)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = Q1; + + Q1 = _mm_shuffle_ps(V1, R0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = Q1; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D +( + FXMVECTOR ScalingOrigin, + float ScalingOrientation, + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + GXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, 
MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = { { { -2.0f, -2.0f, -2.0f, 0.0f } } }; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + + XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) noexcept +{ + static const XMVECTORU32 Select0001 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1 } } }; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = 
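+    /* Illustrative note (not part of the upstream header): XMPlaneDot gives
+       a*Lx + b*Ly + c*Lz + d*Lw for plane (a,b,c,d) and light L; it is folded
+       into the matrix diagonal below. A typical call, flattening geometry
+       onto the ground plane (values hypothetical):
+
+           XMVECTOR ground = XMVectorSet(0.f, 1.f, 0.f, 0.f);  // plane y = 0
+           XMVECTOR light  = XMVectorSet(5.f, 10.f, 2.f, 1.f); // point light
+           XMMATRIX shadow = XMMatrixShadow(ground, light);
+    */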
XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + 
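+    /* Illustrative note (not part of the upstream header): with
+       fRange = FarZ / (FarZ - NearZ), the z terms below map view-space depth
+       to [0, 1] after the perspective divide by w = z:
+
+           z_ndc = (z * fRange - fRange * NearZ) / z
+                 = fRange * (z - NearZ) / z   // 0 at z = NearZ, 1 at z = FarZ
+    */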
+ XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, 
vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // Height / AspectRatio,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + 
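+    /* Illustrative note (not part of the upstream header): FovAngleY is the
+       full vertical field of view in radians and AspectRatio is width over
+       height; the implementation uses Height = cot(FovAngleY / 2). A common
+       call-site sketch (names hypothetical):
+
+           XMMATRIX proj = XMMatrixPerspectiveFovRH(
+               XMConvertToRadians(60.0f),     // vertical FOV
+               float(width) / float(height),  // aspect ratio
+               0.1f, 100.0f);                 // near, far
+    */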
assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // Height / AspectRatio,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 
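+    /* Illustrative note (not part of the upstream header): the off-center
+       form takes the near-plane rectangle explicitly, which asymmetric
+       frustums (tiled rendering, per-eye VR projections) need. The symmetric
+       case reduces to XMMatrixPerspectiveLH:
+
+           XMMatrixPerspectiveOffCenterLH(-w / 2, w / 2, -h / 2, h / 2, n, f)
+           // equals XMMatrixPerspectiveLH(w, h, n, f)
+    */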
1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * 
ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ - NearZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 
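+    /* Illustrative note (not part of the upstream header): here
+       fRange = 1 / (NearZ - FarZ), so right-handed view depth (negative z in
+       front of the eye) lands in [0, 1] without a perspective divide:
+
+           z_ndc = z * fRange + fRange * NearZ
+           // z = -NearZ -> 0, z = -FarZ -> 1
+    */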
fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ - FarZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + 
ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) 
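+/* Illustrative note (not part of the upstream header): the off-center
+   orthographic form maps [ViewLeft, ViewRight] x [ViewBottom, ViewTop] x
+   [NearZ, FarZ] onto normalized device coordinates, which makes pixel-space
+   overlays direct. A common sketch (sizes hypothetical):
+
+       XMMATRIX ui = XMMatrixOrthographicOffCenterLH(
+           0.0f, 1280.0f,   // left, right in pixels
+           720.0f, 0.0f,    // bottom, top flipped so +y points down-screen
+           0.0f, 1.0f);     // near, far
+*/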
+#endif
+
+/****************************************************************************
+ *
+ * XMMATRIX operators and methods
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX::XMMATRIX
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+) noexcept
+{
+    r[0] = XMVectorSet(m00, m01, m02, m03);
+    r[1] = XMVectorSet(m10, m11, m12, m13);
+    r[2] = XMVectorSet(m20, m21, m22, m23);
+    r[3] = XMVectorSet(m30, m31, m32, m33);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX::XMMATRIX(const float* pArray) noexcept
+{
+    assert(pArray != nullptr);
+    r[0] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray));
+    r[1] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 4));
+    r[2] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 8));
+    r[3] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 12));
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XMMATRIX::operator- () const noexcept
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorNegate(r[0]);
+    R.r[1] = XMVectorNegate(r[1]);
+    R.r[2] = XMVectorNegate(r[2]);
+    R.r[3] = XMVectorNegate(r[3]);
+    return R;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorAdd(r[0], M.r[0]);
+    r[1] = XMVectorAdd(r[1], M.r[1]);
+    r[2] = XMVectorAdd(r[2], M.r[2]);
+    r[3] = XMVectorAdd(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorSubtract(r[0], M.r[0]);
+    r[1] = XMVectorSubtract(r[1], M.r[1]);
+    r[2] = XMVectorSubtract(r[2], M.r[2]);
+    r[3] = XMVectorSubtract(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) noexcept
+{
+    *this = XMMatrixMultiply(*this, M);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator*= (float S) noexcept
+{
+    r[0] = XMVectorScale(r[0], S);
+    r[1] = XMVectorScale(r[1], S);
+    r[2] = XMVectorScale(r[2], S);
+    r[3] = XMVectorScale(r[3], S);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vS = XMVectorReplicate(S);
+    r[0] = XMVectorDivide(r[0], vS);
+    r[1] = XMVectorDivide(r[1], vS);
+    r[2] = XMVectorDivide(r[2], vS);
+    r[3] = XMVectorDivide(r[3], vS);
+    return *this;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    float32x4_t vS = vdupq_n_f32(S);
+    r[0] = vdivq_f32(r[0], vS);
+    r[1] = vdivq_f32(r[1], vS);
+    r[2] = vdivq_f32(r[2], vS);
+    r[3] = vdivq_f32(r[3], vS);
+#else
+    // 2 iterations of Newton-Raphson refinement of reciprocal
+    float32x2_t vS = vdup_n_f32(S);
+    float32x2_t R0 = vrecpe_f32(vS);
+    float32x2_t S0 = vrecps_f32(R0, vS);
+    R0 = vmul_f32(S0, R0);
+    S0 = vrecps_f32(R0, vS);
+    R0 = vmul_f32(S0, R0);
+    float32x4_t Reciprocal =
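+    /* Illustrative note (not part of the upstream header): vrecpe_f32 yields
+       only a coarse reciprocal estimate; each vrecps_f32 returns
+       (2 - S * R0), so R0 * (2 - S * R0) is one Newton-Raphson refinement of
+       1/S, and two steps approach full single precision. Scalar sketch:
+
+           float r = rough_recip(S);   // hypothetical coarse estimate
+           r = r * (2.0f - S * r);     // refinement step 1
+           r = r * (2.0f - S * r);     // refinement step 2: r ~= 1.0f / S
+    */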
vcombine_f32(R0, R0); + r[0] = vmulq_f32(r[0], Reciprocal); + r[1] = vmulq_f32(r[1], Reciprocal); + r[2] = vmulq_f32(r[2], Reciprocal); + r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return *this; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + r[0] = _mm_div_ps(r[0], vS); + r[1] = _mm_div_ps(r[1], vS); + r[2] = _mm_div_ps(r[2], vS); + r[3] = _mm_div_ps(r[3], vS); + return *this; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorAdd(r[0], M.r[0]); + R.r[1] = XMVectorAdd(r[1], M.r[1]); + R.r[2] = XMVectorAdd(r[2], M.r[2]); + R.r[3] = XMVectorAdd(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorSubtract(r[0], M.r[0]); + R.r[1] = XMVectorSubtract(r[1], M.r[1]); + R.r[2] = XMVectorSubtract(r[2], M.r[2]); + R.r[3] = XMVectorSubtract(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const noexcept +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(r[0], S); + R.r[1] = XMVectorScale(r[1], S); + R.r[2] = XMVectorScale(r[2], S); + R.r[3] = XMVectorScale(r[3], S); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate(S); + XMMATRIX R; + R.r[0] = XMVectorDivide(r[0], vS); + R.r[1] = XMVectorDivide(r[1], vS); + R.r[2] = XMVectorDivide(r[2], vS); + R.r[3] = XMVectorDivide(r[3], vS); + return R; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t vS = vdupq_n_f32(S); + XMMATRIX R; + R.r[0] = vdivq_f32(r[0], vS); + R.r[1] = vdivq_f32(r[1], vS); + R.r[2] = vdivq_f32(r[2], vS); + R.r[3] = vdivq_f32(r[3], vS); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32(S); + float32x2_t R0 = vrecpe_f32(vS); + float32x2_t S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + float32x4_t Reciprocal = vcombine_f32(R0, R0); + XMMATRIX R; + R.r[0] = vmulq_f32(r[0], Reciprocal); + R.r[1] = vmulq_f32(r[1], Reciprocal); + R.r[2] = vmulq_f32(r[2], Reciprocal); + R.r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return R; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + XMMATRIX R; + R.r[0] = _mm_div_ps(r[0], vS); + R.r[1] = _mm_div_ps(r[1], vS); + R.r[2] = _mm_div_ps(r[2], vS); + R.r[3] = _mm_div_ps(r[3], vS); + return R; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV operator* +( + float S, + FXMMATRIX M + ) noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(M.r[0], S); + R.r[1] = XMVectorScale(M.r[1], S); + R.r[2] = XMVectorScale(M.r[2], S); + R.r[3] = XMVectorScale(M.r[3], S); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * 
+ ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +/**************************************************************************** +* +* XMFLOAT3X4 operators +* +****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X4::XMFLOAT3X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + diff --git a/include/directxmath/directxmathmisc.inl b/include/directxmath/directxmathmisc.inl new file mode 100644 index 0000000..c05f1ca --- /dev/null +++ b/include/directxmath/directxmathmisc.inl @@ -0,0 +1,2492 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// https://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept +{ + return XMVector4Equal(Q, g_XMIdentityR3.v); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) + + // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), + // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), + // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), + // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), + (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), + (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), + (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 
1.0f, 1.0f, -1.0f } } }; + + float32x2_t Q2L = vget_low_f32(Q2); + float32x2_t Q2H = vget_high_f32(Q2); + + float32x4_t Q2X = vdupq_lane_f32(Q2L, 0); + float32x4_t Q2Y = vdupq_lane_f32(Q2L, 1); + float32x4_t Q2Z = vdupq_lane_f32(Q2H, 0); + XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); + + // Mul by Q1WZYX + float32x4_t vTemp = vrev64q_f32(Q1); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2X = vmulq_f32(Q2X, vTemp); + vResult = vmlaq_f32(vResult, Q2X, ControlWZYX); + + // Mul by Q1ZWXY + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + Q2Y = vmulq_f32(Q2Y, vTemp); + vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); + + // Mul by Q1YXWZ + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2Z = vmulq_f32(Q2Z, vTemp); + vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 1.0f, 1.0f, -1.0f } } }; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3)); + Q2X = XM_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0)); + Q2Y = XM_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1)); + Q2Z = XM_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult, Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1)); + // Flip the signs on y and z + vResult = XM_FMADD_PS(Q2X, ControlWZYX, vResult); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y, ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle); + // Flip the signs on x and w + Q2Y = XM_FMADD_PS(Q2Z, ControlYXWZ, Q2Y); + vResult = _mm_add_ps(vResult, Q2Y); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept +{ 
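+    // The conjugate negates the vector part and keeps w: (-x, -y, -z, w).
+    // For a unit rotation quaternion the conjugate equals the inverse, so a
+    // rotation can be undone without the divide in XMQuaternionInverse.
+    // Illustrative sketch (not part of the library):
+    //   XMVECTOR r = XMVector3Rotate(v, q);                        // rotate v by q
+    //   XMVECTOR u = XMVector3Rotate(r, XMQuaternionConjugate(q)); // u == v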
+#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return vmulq_f32(Q, NegativeOne3.v); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return _mm_mul_ps(Q, NegativeOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept +{ + XMVECTOR L = XMVector4LengthSq(Q); + XMVECTOR Conjugate = XMQuaternionConjugate(Q); + + XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + XMVECTOR Result = XMVectorDivide(Conjugate, L); + + Result = XMVectorSelect(Result, g_XMZero, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept +{ + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR QW = XMVectorSplatW(Q); + XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); + + XMVECTOR Theta = XMVectorACos(QW); + XMVECTOR SinTheta = XMVectorSin(Theta); + + XMVECTOR S = XMVectorDivide(Theta, SinTheta); + + XMVECTOR Result = XMVectorMultiply(Q0, S); + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept +{ + XMVECTOR Theta = XMVector3Length(Q); + + XMVECTOR SinTheta, CosTheta; + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + XMVECTOR S = XMVectorDivide(SinTheta, Theta); + + XMVECTOR Result = XMVectorMultiply(Q, S); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + 
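// sin^2(Omega) = 1 - cos^2(Omega): the XMVectorNegativeMultiplySubtract above
+    // computed g_XMOne - CosOmega*CosOmega, and the square root below recovers
+    // sin(Omega) for the spherical interpolation weights.
+    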
SinOmega = XMVectorSqrt(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR SignMask = XMVectorSplatSignMask(); + XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(g_XMIdentityR0.v, V01); + + XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); + + XMVECTOR S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + XMVECTOR Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + static const XMVECTORU32 SignMask2 = { { { 0x80000000, 0x00000000, 0x00000000, 0x00000000 } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = _mm_mul_ps(CosOmega, CosOmega); + SinOmega = _mm_sub_ps(g_XMOne, SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR V01 = XM_PERMUTE_PS(T, _MM_SHUFFLE(2, 3, 0, 1)); + V01 = _mm_and_ps(V01, g_XMMaskXY); + V01 = _mm_xor_ps(V01, SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + XMVECTOR S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + XMVECTOR Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result, S1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + HXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + XMVECTOR TP = T; + const XMVECTOR Two = XMVectorSplatConstant(2, 0); + + XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); + XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3 +) noexcept +{ + assert(pA); + assert(pB); + assert(pC); + + XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + XMVECTOR SQ2 = 
XMVectorNegate(Q2); + + XMVECTOR Control1 = XMVectorLess(LS12, LD12); + SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + XMVECTOR SQ0 = XMVectorNegate(Q0); + + XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + XMVECTOR SQ3 = XMVectorNegate(Q3); + + XMVECTOR Control0 = XMVectorLess(LS01, LD01); + XMVECTOR Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + XMVECTOR InvQ1 = XMQuaternionInverse(Q1); + XMVECTOR InvQ2 = XMQuaternionInverse(SQ2); + + XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + float f, + float g +) noexcept +{ + float s = f + g; + + XMVECTOR Result; + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s); + XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + assert((XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F))); + assert((XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G))); + + const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16); + + XMVECTOR S = XMVectorAdd(F, G); + + XMVECTOR Result; + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S); + XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S); + XMVECTOR GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept +{ + return g_XMIdentityR3.v; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + const float halfpitch = Pitch * 0.5f; + float cp = cosf(halfpitch); + float sp = sinf(halfpitch); 
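+    // Half-angle sines/cosines of each Euler angle; the products below compose
+    // the three axis rotations (pitch about x, yaw about y, roll about z;
+    // applied roll first, then pitch, then yaw) into a single quaternion.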
+
+    const float halfyaw = Yaw * 0.5f;
+    float cy = cosf(halfyaw);
+    float sy = sinf(halfyaw);
+
+    const float halfroll = Roll * 0.5f;
+    float cr = cosf(halfroll);
+    float sr = sinf(halfroll);
+
+    XMVECTORF32 vResult = { { {
+            cr * sp * cy + sr * cp * sy,
+            cr * cp * sy - sr * sp * cy,
+            sr * cp * cy - cr * sp * sy,
+            cr * cp * cy + sr * sp * sy
+        } } };
+    return vResult;
+#else
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    return XMQuaternionRotationRollPitchYawFromVector(Angles);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    const float halfpitch = Angles.vector4_f32[0] * 0.5f;
+    float cp = cosf(halfpitch);
+    float sp = sinf(halfpitch);
+
+    const float halfyaw = Angles.vector4_f32[1] * 0.5f;
+    float cy = cosf(halfyaw);
+    float sy = sinf(halfyaw);
+
+    const float halfroll = Angles.vector4_f32[2] * 0.5f;
+    float cr = cosf(halfroll);
+    float sr = sinf(halfroll);
+
+    XMVECTORF32 vResult = { { {
+            cr * sp * cy + sr * cp * sy,
+            cr * cp * sy - sr * sp * cy,
+            sr * cp * cy - cr * sp * sy,
+            cr * cp * cy + sr * sp * sy
+        } } };
+    return vResult;
+#else
+    static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } };
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet(SinV, SinV, SinV, CosV);
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis, g_XMMask3);
+    N = _mm_or_ps(N, g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine, &vCosine, Scale);
+    Scale = _mm_and_ps(vSine, g_XMMask3);
+    vCosine = _mm_and_ps(vCosine, g_XMMaskW);
+    Scale = _mm_or_ps(Scale, vCosine);
+    N = _mm_mul_ps(N, Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+) noexcept
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
+{
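+    // Shepperd-style extraction: the diagonal comparisons below pick whichever
+    // of x^2, y^2, z^2, w^2 is largest, compute that component with one square
+    // root, and derive the rest from sums/differences of off-diagonal terms,
+    // which avoids dividing by a small, cancellation-prone value. M is assumed
+    // to be a pure rotation matrix.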
+#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 q; + float r22 = M.m[2][2]; + if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2 + { + float dif10 = M.m[1][1] - M.m[0][0]; + float omr22 = 1.f - r22; + if (dif10 <= 0.f) // x^2 >= y^2 + { + float fourXSqr = omr22 - dif10; + float inv4x = 0.5f / sqrtf(fourXSqr); + q.f[0] = fourXSqr * inv4x; + q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x; + q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x; + q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x; + } + else // y^2 >= x^2 + { + float fourYSqr = omr22 + dif10; + float inv4y = 0.5f / sqrtf(fourYSqr); + q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y; + q.f[1] = fourYSqr * inv4y; + q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y; + q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y; + } + } + else // z^2 + w^2 >= x^2 + y^2 + { + float sum10 = M.m[1][1] + M.m[0][0]; + float opr22 = 1.f + r22; + if (sum10 <= 0.f) // z^2 >= w^2 + { + float fourZSqr = opr22 - sum10; + float inv4z = 0.5f / sqrtf(fourZSqr); + q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z; + q.f[1] = (M.m[1][2] + M.m[2][1]) * inv4z; + q.f[2] = fourZSqr * inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w; + q.f[3] = fourWSqr * inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } }; + static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + float32x4_t r0 = M.r[0]; + float32x4_t r1 = M.r[1]; + float32x4_t r2 = M.r[2]; + + float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0); + float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + float32x4_t r11mr00 = vsubq_f32(r11, r00); + uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + float32x4_t r11pr00 = vaddq_f32(r11, r00); + uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + float32x4_t t0 = vmulq_f32(XMPMMP, r00); + float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11); + x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22); + x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + float32x4_t t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1))); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0); + t1 = vbslq_f32(Select0110, t1, r10); + + // (4*x*y, 4*x*z, 4*y*z, unused) + float32x4_t xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10)); + + // (r12, r02, r01, r12) + float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0))); + float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0); + t1 = vbslq_f32(Select0110, t2, t3); + + // (4*x*w, 4*y*w, 4*z*w, unused) + float32x4_t xwywzw = vsubq_f32(t0, t1); + xwywzw = 
vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 3); + t1 = vbslq_f32(Select0110, t0, x2y2z2w2); + t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0); + float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2); + t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1); + float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw))); + float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32(x2gey2, tensor0, tensor1); + t1 = vbslq_f32(z2gew2, tensor2, tensor3); + t2 = vbslq_f32(x2py2gez2pw2, t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne); + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0); + XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1)); + + 
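// t0, t1 and t2 now hold every distinct entry of the symmetric tensor
+    // product 4*q*q^T; the shuffles below gather them into its four rows
+    // (tensor0..tensor3) so the best-conditioned row can be selected.
+    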
// (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) noexcept +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) noexcept +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR 
XM_CALLCONV XMPlaneDotCoord
+(
+    FXMVECTOR P,
+    FXMVECTOR V
+) noexcept
+{
+    // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]
+
+    XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
+    XMVECTOR Result = XMVector4Dot(P, V3);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMPlaneDotNormal
+(
+    FXMVECTOR P,
+    FXMVECTOR V
+) noexcept
+{
+    return XMVector3Dot(P, V);
+}
+
+//------------------------------------------------------------------------------
+// XMPlaneNormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
+    return XMVectorMultiply(P, Result);
+
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp = _mm_dp_ps(P, P, 0x7f);
+    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
+    return _mm_mul_ps(vResult, P);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product
+    XMVECTOR vDot = _mm_mul_ps(P, P);
+    // x=Dot.y, y=Dot.z
+    XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
+    // Result.x = x+y
+    vDot = _mm_add_ss(vDot, vTemp);
+    // x=Dot.z
+    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
+    // Result.x = (x+y)+z
+    vDot = _mm_add_ss(vDot, vTemp);
+    // Splat x
+    vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
+    // Get the reciprocal of the length
+    vDot = _mm_rsqrt_ps(vDot);
+    // Multiply the plane by the reciprocal length to normalize
+    vDot = _mm_mul_ps(vDot, P);
+    return vDot;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float fLengthSq = sqrtf((P.vector4_f32[0] * P.vector4_f32[0]) + (P.vector4_f32[1] * P.vector4_f32[1]) + (P.vector4_f32[2] * P.vector4_f32[2]));
+    // Prevent divide by zero
+    if (fLengthSq > 0)
+    {
+        fLengthSq = 1.0f / fLengthSq;
+    }
+    XMVECTORF32 vResult = { { {
+            P.vector4_f32[0] * fLengthSq,
+            P.vector4_f32[1] * fLengthSq,
+            P.vector4_f32[2] * fLengthSq,
+            P.vector4_f32[3] * fLengthSq
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTOR vLength = XMVector3ReciprocalLength(P);
+    return XMVectorMultiply(P, vLength);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_dp_ps(P, P, 0x7f);
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(P, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vLengthSq);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x,y and z only
+    XMVECTOR vLengthSq = _mm_mul_ps(P, P);
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    // Reciprocal mul to perform the
normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) noexcept +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + assert(pLinePoint1); + assert(pLinePoint2); + + XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1, g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX ITM +) noexcept +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, ITM.r[3]); + Result = XMVectorMultiplyAdd(Z, ITM.r[2], Result); + Result = XMVectorMultiplyAdd(Y, ITM.r[1], Result); + Result = XMVectorMultiplyAdd(X, ITM.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX ITM +) noexcept +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + ITM); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) noexcept +{ + XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) noexcept +{ + 
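// The plane normal is the normalized cross product of two triangle edges,
+    // and d = -dot(N, Point1) places the plane through the points; the result
+    // is stored as (N.x, N.y, N.z, d).
+    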
XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3); + return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
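+    // (XOR with g_XMNegate3 flips just the sign bits of x, y and z; the add of
+    // g_XMOne3 = (1,1,1,0) below then yields 1-c per channel while w is kept.)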
+ XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp, g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) noexcept +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + + const XMVECTORF32 gvLuminance = { { { 0.2125f, 0.7154f, 0.0721f, 0.0f } } }; +#if defined(_XM_NO_INTRINSICS_) + float fLuminance = (vColor.vector4_f32[0] * gvLuminance.f[0]) + (vColor.vector4_f32[1] * gvLuminance.f[1]) + (vColor.vector4_f32[2] * gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32(vLuminance, vResult, fSaturation); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + // Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); + // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor, vLuminance); + vResult = XM_FMADD_PS(vResult, vSaturation, vLuminance); + // Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vLuminance, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) noexcept +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32(g_XMOneHalf.v, vResult, fContrast); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor, g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = XM_FMADD_PS(vResult, vScale, g_XMOneHalf); +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vScale, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; 
+#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept +{ + XMVECTOR r = XMVectorSplatX(rgb); + XMVECTOR g = XMVectorSplatY(rgb); + XMVECTOR b = XMVectorSplatZ(rgb); + + XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b)); + XMVECTOR max = XMVectorMax(r, XMVectorMax(g, b)); + + XMVECTOR l = XMVectorMultiply(XMVectorAdd(min, max), g_XMOneHalf); + + XMVECTOR d = XMVectorSubtract(max, min); + + XMVECTOR la = XMVectorSelect(rgb, l, g_XMSelect1110); + + if (XMVector3Less(d, g_XMEpsilon)) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect(la, g_XMZero, g_XMSelect1100); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd(min, max); + + if (XMVector3Greater(l, g_XMOneHalf)) + { + // d / (2-max-min) + s = XMVectorDivide(d, XMVectorSubtract(g_XMTwo, d2)); + } + else + { + // d / (max+min) + s = XMVectorDivide(d, d2); + } + + if (XMVector3Equal(r, max)) + { + // Red is max + h = XMVectorDivide(XMVectorSubtract(g, b), d); + } + else if (XMVector3Equal(g, max)) + { + // Green is max + h = XMVectorDivide(XMVectorSubtract(b, r), d); + h = XMVectorAdd(h, g_XMTwo); + } + else + { + // Blue is max + h = XMVectorDivide(XMVectorSubtract(r, g), d); + h = XMVectorAdd(h, g_XMFour); + } + + h = XMVectorDivide(h, g_XMSix); + + if (XMVector3Less(h, g_XMZero)) + h = XMVectorAdd(h, g_XMOne); + + XMVECTOR lha = XMVectorSelect(la, h, g_XMSelect1100); + return XMVectorSelect(s, lha, g_XMSelect1011); + } +} + +//------------------------------------------------------------------------------ + +namespace MathInternal +{ + + inline XMVECTOR XM_CALLCONV XMColorHue2Clr(FXMVECTOR p, FXMVECTOR q, FXMVECTOR h) noexcept + { + static const XMVECTORF32 oneSixth = { { { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f } } }; + static const XMVECTORF32 twoThirds = { { { 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f } } }; + + XMVECTOR t = h; + + if (XMVector3Less(t, g_XMZero)) + t = XMVectorAdd(t, g_XMOne); + + if (XMVector3Greater(t, g_XMOne)) + t = XMVectorSubtract(t, g_XMOne); + + if (XMVector3Less(t, oneSixth)) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, t); + return XMVectorMultiplyAdd(t1, t2, p); + } + + if (XMVector3Less(t, g_XMOneHalf)) + return q; + + if (XMVector3Less(t, twoThirds)) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, XMVectorSubtract(twoThirds, t)); + return XMVectorMultiplyAdd(t1, t2, p); + } + + return p; + } + +} // namespace MathInternal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept +{ + static const XMVECTORF32 oneThird = { { { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f } } }; + + XMVECTOR s = XMVectorSplatY(hsl); + XMVECTOR l = XMVectorSplatZ(hsl); + + if (XMVector3NearEqual(s, g_XMZero, g_XMEpsilon)) + { + // Achromatic + return XMVectorSelect(hsl, l, g_XMSelect1110); + } + else + { + XMVECTOR h = XMVectorSplatX(hsl); + + XMVECTOR q; + if (XMVector3Less(l, g_XMOneHalf)) + { + q = XMVectorMultiply(l, XMVectorAdd(g_XMOne, s)); + } + else + { + q = XMVectorSubtract(XMVectorAdd(l, s), XMVectorMultiply(l, s)); + } + + XMVECTOR p = XMVectorSubtract(XMVectorMultiply(g_XMTwo, l), q); + + XMVECTOR r = MathInternal::XMColorHue2Clr(p, q, XMVectorAdd(h, oneThird)); + XMVECTOR g = MathInternal::XMColorHue2Clr(p, q, h); + XMVECTOR b = MathInternal::XMColorHue2Clr(p, q, XMVectorSubtract(h, oneThird)); + + XMVECTOR 
rg = XMVectorSelect(g, r, g_XMSelect1000);
+        XMVECTOR ba = XMVectorSelect(hsl, b, g_XMSelect1110);
+
+        return XMVectorSelect(ba, rg, g_XMSelect1100);
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept
+{
+    XMVECTOR r = XMVectorSplatX(rgb);
+    XMVECTOR g = XMVectorSplatY(rgb);
+    XMVECTOR b = XMVectorSplatZ(rgb);
+
+    XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b));
+    XMVECTOR v = XMVectorMax(r, XMVectorMax(g, b));
+
+    XMVECTOR d = XMVectorSubtract(v, min);
+
+    XMVECTOR s = (XMVector3NearEqual(v, g_XMZero, g_XMEpsilon)) ? g_XMZero : XMVectorDivide(d, v);
+
+    if (XMVector3Less(d, g_XMEpsilon))
+    {
+        // Achromatic, assume H of 0
+        XMVECTOR hv = XMVectorSelect(v, g_XMZero, g_XMSelect1000);
+        XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110);
+        return XMVectorSelect(s, hva, g_XMSelect1011);
+    }
+    else
+    {
+        XMVECTOR h;
+
+        if (XMVector3Equal(r, v))
+        {
+            // Red is max
+            h = XMVectorDivide(XMVectorSubtract(g, b), d);
+
+            if (XMVector3Less(g, b))
+                h = XMVectorAdd(h, g_XMSix);
+        }
+        else if (XMVector3Equal(g, v))
+        {
+            // Green is max
+            h = XMVectorDivide(XMVectorSubtract(b, r), d);
+            h = XMVectorAdd(h, g_XMTwo);
+        }
+        else
+        {
+            // Blue is max
+            h = XMVectorDivide(XMVectorSubtract(r, g), d);
+            h = XMVectorAdd(h, g_XMFour);
+        }
+
+        h = XMVectorDivide(h, g_XMSix);
+
+        XMVECTOR hv = XMVectorSelect(v, h, g_XMSelect1000);
+        XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110);
+        return XMVectorSelect(s, hva, g_XMSelect1011);
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept
+{
+    XMVECTOR h = XMVectorSplatX(hsv);
+    XMVECTOR s = XMVectorSplatY(hsv);
+    XMVECTOR v = XMVectorSplatZ(hsv);
+
+    XMVECTOR h6 = XMVectorMultiply(h, g_XMSix);
+
+    XMVECTOR i = XMVectorFloor(h6);
+    XMVECTOR f = XMVectorSubtract(h6, i);
+
+    // p = v* (1-s)
+    XMVECTOR p = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, s));
+
+    // q = v*(1-f*s)
+    XMVECTOR q = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(f, s)));
+
+    // t = v*(1 - (1-f)*s)
+    XMVECTOR t = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(XMVectorSubtract(g_XMOne, f), s)));
+
+    auto ii = static_cast<size_t>(XMVectorGetX(XMVectorMod(i, g_XMSix)));
+
+    XMVECTOR _rgb;
+
+    switch (ii)
+    {
+    case 0: // rgb = vtp
+    {
+        XMVECTOR vt = XMVectorSelect(t, v, g_XMSelect1000);
+        _rgb = XMVectorSelect(p, vt, g_XMSelect1100);
+    }
+    break;
+    case 1: // rgb = qvp
+    {
+        XMVECTOR qv = XMVectorSelect(v, q, g_XMSelect1000);
+        _rgb = XMVectorSelect(p, qv, g_XMSelect1100);
+    }
+    break;
+    case 2: // rgb = pvt
+    {
+        XMVECTOR pv = XMVectorSelect(v, p, g_XMSelect1000);
+        _rgb = XMVectorSelect(t, pv, g_XMSelect1100);
+    }
+    break;
+    case 3: // rgb = pqv
+    {
+        XMVECTOR pq = XMVectorSelect(q, p, g_XMSelect1000);
+        _rgb = XMVectorSelect(v, pq, g_XMSelect1100);
+    }
+    break;
+    case 4: // rgb = tpv
+    {
+        XMVECTOR tp = XMVectorSelect(p, t, g_XMSelect1000);
+        _rgb = XMVectorSelect(v, tp, g_XMSelect1100);
+    }
+    break;
+    default: // rgb = vpq
+    {
+        XMVECTOR vp = XMVectorSelect(p, v, g_XMSelect1000);
+        _rgb = XMVectorSelect(q, vp, g_XMSelect1100);
+    }
+    break;
+    }
+
+    return XMVectorSelect(hsv, _rgb, g_XMSelect1110);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept
+{
+    static const XMVECTORF32 Scale0 = { { { 0.299f, -0.147f, 0.615f, 0.0f } } };
+    static
const XMVECTORF32 Scale1 = { { { 0.587f, -0.289f, -0.515f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.114f, 0.436f, -0.100f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.395f, 2.032f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.140f, -0.581f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2126f, -0.0997f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.7152f, -0.3354f, -0.5586f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0722f, 0.4351f, -0.0564f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.2153f, 2.1324f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.2803f, -0.3806f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2627f, -0.1215f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.6780f, -0.3136f, -0.5655f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0593f, 0.4351f, -0.0495f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.1891f, 2.1620f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.1989f, -0.4645f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVectorMultiply(XMVector3Transform(rgb, M), Scale); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +inline XMVECTOR 
XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 0.17697f, 0.17697f, 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(XMVectorMultiply(xyz, Scale), M); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 3.2406f, -0.9689f, 0.0557f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -1.5372f, 1.8758f, -0.2040f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4986f, 0.0415f, 1.0570f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR lclr = XMVector3Transform(xyz, M); + + XMVECTOR sel = XMVectorGreater(lclr, Cutoff); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply(lclr, g_XMsrgbScale); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract(XMVectorMultiply(g_XMsrgbA1, XMVectorPow(lclr, Exp)), g_XMsrgbA); + + XMVECTOR clr = XMVectorSelect(smallC, largeC, sel); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4124f, 0.2126f, 0.0193f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3576f, 0.7152f, 0.1192f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.1805f, 0.0722f, 0.9505f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 2.4f, 2.4f, 2.4f, 1.0f } } }; + + XMVECTOR sel = XMVectorGreater(srgb, Cutoff); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide(srgb, g_XMsrgbScale); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow(XMVectorDivide(XMVectorAdd(srgb, g_XMsrgbA), g_XMsrgbA1), Exp); + + XMVECTOR lclr = XMVectorSelect(smallC, largeC, sel); + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(lclr, M); + + return XMVectorSelect(srgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 1.f } } }; + static const XMVECTORF32 Linear = { { { 12.92f, 12.92f, 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.055f, 1.055f, 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 InvGamma = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply(V, Linear); + XMVECTOR V1 = XMVectorSubtract(XMVectorMultiply(Scale, XMVectorPow(V, InvGamma)), Bias); + XMVECTOR select = XMVectorLess(V, Cutoff); + 
V = XMVectorSelect(V1, V0, select); + return XMVectorSelect(rgb, V, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 1.f } } }; + static const XMVECTORF32 ILinear = { { { 1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 Gamma = { { { 2.4f, 2.4f, 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply(V, ILinear); + XMVECTOR V1 = XMVectorPow(XMVectorMultiply(XMVectorAdd(V, Bias), Scale), Gamma); + XMVECTOR select = XMVectorGreater(V, Cutoff); + V = XMVectorSelect(V0, V1, select); + return XMVectorSelect(srgb, V, g_XMSelect1110); +} + +/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() noexcept +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(__powerpc64__) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; +#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + +#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) + if ((CPUInfo[2] & 0x18081001) != 0x18081001) + return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) +#if defined(__clang__) || defined(__GNUC__) + __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuidex(CPUInfo, 7, 0); +#endif + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on 
ARM platform + return true; +#else + // No intrinsics path always supported + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) noexcept +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex, RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle, CosIncidentAngle); + G = _mm_sub_ps(G, g_XMOne); + vTemp = _mm_add_ps(vTemp, G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G, vTemp); + G = _mm_max_ps(G, vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G, CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G, CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC, GSubC); + vTemp = _mm_mul_ps(GAddC, GAddC); + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + vResult = _mm_div_ps(vResult, vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC, CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC, CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC, g_XMOne); + GSubC = _mm_add_ps(GSubC, g_XMOne); + GAddC = _mm_mul_ps(GAddC, GAddC); + GSubC = _mm_mul_ps(GSubC, GSubC); + GAddC = _mm_div_ps(GAddC, GSubC); + GAddC = _mm_add_ps(GAddC, g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult, GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) noexcept +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle(float Angle) noexcept +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to 
PI
+
+    // Normalize the range from 0.0f to XM_2PI
+    Angle = Angle + XM_PI;
+    // Perform the modulo, unsigned
+    float fTemp = fabsf(Angle);
+    fTemp = fTemp - (XM_2PI * static_cast<float>(static_cast<uint32_t>(fTemp / XM_2PI)));
+    // Restore the number to the range of -XM_PI to XM_PI-epsilon
+    fTemp = fTemp - XM_PI;
+    // If the modulo'd value was negative, restore negation
+    if (Angle < 0.0f)
+    {
+        fTemp = -fTemp;
+    }
+    return fTemp;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSin(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 11-degree minimax approximation
+    float y2 = y * y;
+    return (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSinEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 7-degree minimax approximation
+    float y2 = y * y;
+    return (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCos(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    // 10-degree minimax approximation
+    float y2 = y * y;
+    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
+    return sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCosEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    // 6-degree minimax approximation
+    float y2 = y * y;
+    float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f;
+    return sign * p;
+}
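+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// Ties the scalar helpers above together: XMScalarModAngle wraps an arbitrary
+// angle into [-XM_PI, XM_PI), and because XMScalarSin performs the same range
+// reduction internally, the wrapped and unwrapped angles give the same sine.
+// The function name below is hypothetical.
+inline float XMExampleWrappedSin(float angleRadians) noexcept
+{
+    float wrapped = XMScalarModAngle(angleRadians); // wrapped is in [-XM_PI, XM_PI)
+    return XMScalarSin(wrapped);                    // equals XMScalarSin(angleRadians)
+}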
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline void XMScalarSinCos
+(
+    float* pSin,
+    float* pCos,
+    float Value
+) noexcept
+{
+    assert(pSin);
+    assert(pCos);
+
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    float y2 = y * y;
+
+    // 11-degree minimax approximation
+    *pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
+
+    // 10-degree minimax approximation
+    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
+    *pCos = sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline void XMScalarSinCosEst
+(
+    float* pSin,
+    float* pCos,
+    float Value
+) noexcept
+{
+    assert(pSin);
+    assert(pCos);
+
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    float y2 = y * y;
+
+    // 7-degree minimax approximation
+    *pSin = (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y;
+
+    // 6-degree minimax approximation
+    float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f;
+    *pCos = sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarASin(float Value) noexcept
+{
+    // Clamp input to [-1,1].
+    bool nonnegative = (Value >= 0.0f);
+    float x = fabsf(Value);
+    float omx = 1.0f - x;
+    if (omx < 0.0f)
+    {
+        omx = 0.0f;
+    }
+    float root = sqrtf(omx);
+
+    // 7-degree minimax approximation
+    float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f;
+    result *= root;  // acos(|x|)
+
+    // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x)
+    return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2);
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarASinEst(float Value) noexcept
+{
+    // Clamp input to [-1,1].
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} diff --git a/include/directxmath/directxmathvector.inl b/include/directxmath/directxmathvector.inl new file mode 100644 index 0000000..2b0ce98 --- /dev/null +++ b/include/directxmath/directxmathvector.inl @@ -0,0 +1,14869 @@ +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+//
+// https://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#if defined(_XM_NO_INTRINSICS_)
+#define XMISNAN(x)  isnan(x)
+#define XMISINF(x)  isinf(x)
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_)
+
+#define XM3UNPACK3INTO4(l1, l2, l3) \
+    XMVECTOR V3 = _mm_shuffle_ps(l2, l3, _MM_SHUFFLE(0, 0, 3, 2));\
+    XMVECTOR V2 = _mm_shuffle_ps(l2, l1, _MM_SHUFFLE(3, 3, 1, 0));\
+    V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 0, 2));\
+    XMVECTOR V4 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(L3), 32 / 8))
+
+#define XM3PACK4INTO3(v2x) \
+    v2x = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 2, 1));\
+    V2 = _mm_shuffle_ps(V2, V1, _MM_SHUFFLE(2, 2, 0, 0));\
+    V1 = _mm_shuffle_ps(V1, V2, _MM_SHUFFLE(0, 2, 1, 0));\
+    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(0, 0, 2, 2));\
+    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(2, 1, 2, 0))
+
+#endif
+
+/****************************************************************************
+ *
+ * General Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Assignment operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Return a vector with all elements equaling zero
+inline XMVECTOR XM_CALLCONV XMVectorZero() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32(0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four floating point values
+inline XMVECTOR XM_CALLCONV XMVectorSet
+(
+    float x,
+    float y,
+    float z,
+    float w
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { { x, y, z, w } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t V0 = vcreate_f32(
+        static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&x))
+        | (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&y)) << 32));
+    float32x2_t V1 = vcreate_f32(
+        static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&z))
+        | (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&w)) << 32));
+    return vcombine_f32(V0, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps(w, z, y, x);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four integer values
+inline XMVECTOR XM_CALLCONV XMVectorSetInt
+(
+    uint32_t x,
+    uint32_t y,
+    uint32_t z,
+    uint32_t w
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = { { { x, y, z, w } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t V0 = vcreate_u32(static_cast<uint64_t>(x) | (static_cast<uint64_t>(y) << 32));
+    uint32x2_t V1 = vcreate_u32(static_cast<uint64_t>(z) | (static_cast<uint64_t>(w) << 32));
+    return vreinterpretq_f32_u32(vcombine_u32(V0, V1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set_epi32(static_cast<int>(w), static_cast<int>(z), static_cast<int>(y), static_cast<int>(x));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value
+inline XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32(Value);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps1(Value);
+#endif
+}
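+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// A minimal illustration of the initializers above: XMVectorSet packs four
+// scalars into the x,y,z,w lanes and XMVectorReplicate broadcasts one scalar to
+// all four lanes. XMVectorMultiply is assumed from its prototype earlier in
+// DirectXMath.h. The function name is hypothetical.
+inline XMVECTOR XM_CALLCONV XMExampleScalePoint(float s) noexcept
+{
+    XMVECTOR point = XMVectorSet(1.0f, 2.0f, 3.0f, 1.0f); // (x, y, z, w)
+    XMVECTOR scale = XMVectorReplicate(s);                // (s, s, s, s)
+    return XMVectorMultiply(point, scale);                // component-wise product
+}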
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr(const float* pValue) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float Value = pValue[0];
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_dup_f32(pValue);
+#elif defined(_XM_AVX_INTRINSICS_)
+    return _mm_broadcast_ss(pValue);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1(pValue);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value
+inline XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(Value));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_set1_epi32(static_cast<int>(Value));
+    return _mm_castsi128_ps(vTemp);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(const uint32_t* pValue) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t Value = pValue[0];
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_dup_u32(pValue));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1(reinterpret_cast<const float*>(pValue));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits set (true mask)
+inline XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = { { { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_s32(vdupq_n_s32(-1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set1_epi32(-1);
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits clear (false mask)
+inline XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Replicate the x component of the vector
+inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = V.vector4_f32[0];
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return
vdupq_lane_f32(vget_low_f32(V), 0); +#elif defined(_XM_AVX2_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + return _mm_broadcastss_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[1]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_low_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[2]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[3]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = 1.0f; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7F800000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7FC00000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7FC00000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR 
XM_CALLCONV XMVectorSplatEpsilon() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = 0x34000000;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(0x34000000));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return g_XMEpsilon;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
+inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = 0x80000000U;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000U));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set1_epi32(static_cast<int>(0x80000000));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return a floating point value via an index. This is not a recommended
+// function to use due to performance loss.
+inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[i];
+#else
+    XMVECTORF32 U;
+    U.v = V;
+    return U.f[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return the X component in an FPU register.
+inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cvtss_f32(V);
+#endif
+}
+
+// Return the Y component in an FPU register.
+inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    return _mm_cvtss_f32(vTemp);
+#endif
+}
+
+// Return the Z component in an FPU register.
+inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    return _mm_cvtss_f32(vTemp);
+#endif
+}
+
+// Return the W component in an FPU register.
+inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+    return _mm_cvtss_f32(vTemp);
+#endif
+}
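+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// As the comment on XMVectorGetByIndex warns, the indexed accessor spills the
+// vector to memory; when the component is known at compile time, prefer the
+// fixed accessors, which stay in registers. The function name is hypothetical.
+inline float XMExampleSumXY(FXMVECTOR V) noexcept
+{
+    // XMVectorGetX/XMVectorGetY compile to register moves/shuffles on SSE/NEON.
+    return XMVectorGetX(V) + XMVectorGetY(V);
+}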
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetByIndexPtr(float* f, FXMVECTOR V, size_t i) noexcept
+{
+    assert(f != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    *f = V.vector4_f32[i];
+#else
+    XMVECTORF32 U;
+    U.v = V;
+    *f = U.f[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetXPtr(float* x, FXMVECTOR V) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(x, V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(x, V);
+#endif
+}
+
+// Store the Y component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetYPtr(float* y, FXMVECTOR V) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(y, V, 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *(reinterpret_cast<uint32_t*>(y)) = _mm_extract_ps(V, 1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    _mm_store_ss(y, vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetZPtr(float* z, FXMVECTOR V) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(z, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *(reinterpret_cast<uint32_t*>(z)) = _mm_extract_ps(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(z, vResult);
+#endif
+}
+
+// Store the W component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetWPtr(float* w, FXMVECTOR V) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(w, V, 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *(reinterpret_cast<uint32_t*>(w)) = _mm_extract_ps(V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+    _mm_store_ss(w, vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return an integer value via an index. This is not a recommended
+// function to use due to performance loss.
+inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[i];
+#else
+    XMVECTORU32 U;
+    U.v = V;
+    return U.u[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return the X component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
+#endif
+}
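+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// XMVectorGetIntX reads lane 0 as a raw bit pattern, which is handy for testing
+// masks produced by the comparison functions (true lanes are all-ones). The
+// function name is hypothetical.
+inline bool XMExampleLaneXIsTrue(FXMVECTOR Mask) noexcept
+{
+    return XMVectorGetIntX(Mask) == 0xFFFFFFFFU; // all bits set in lane 0
+}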
+
+// Return the Y component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(1, 1, 1, 1));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the Z component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(2, 2, 2, 2));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the W component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(3, 3, 3, 3));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t* x, FXMVECTOR V, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#else
+    XMVECTORU32 U;
+    U.v = V;
+    *x = U.u[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t* x, FXMVECTOR V) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(x), V);
+#endif
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t* y, FXMVECTOR V) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *y = static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    _mm_store_ss(reinterpret_cast<float*>(y), vResult);
+#endif
+}
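+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// The *Ptr accessors above store a single lane straight to memory instead of
+// spilling the whole 16-byte vector. The function name is hypothetical.
+inline void XMExampleStoreXYBits(uint32_t* px, uint32_t* py, FXMVECTOR V) noexcept
+{
+    XMVectorGetIntXPtr(px, V); // writes lane 0's bit pattern to *px
+    XMVectorGetIntYPtr(py, V); // writes lane 1's bit pattern to *py
+}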
+
+// Store the Z component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t* z, FXMVECTOR V) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *z = static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(z), vResult);
+#endif
+}
+
+// Store the W component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t* w, FXMVECTOR V) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *w = static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+    _mm_store_ss(reinterpret_cast<float*>(w), vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Set a single indexed floating point component
+inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORF32 U;
+    U.v = V;
+    U.f[i] = f;
+    return U.v;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            x,
+            V.vector4_f32[1],
+            V.vector4_f32[2],
+            V.vector4_f32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(x, V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(x);
+    vResult = _mm_move_ss(V, vResult);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            V.vector4_f32[0],
+            y,
+            V.vector4_f32[2],
+            V.vector4_f32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(y, V, 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(y);
+    vResult = _mm_insert_ps(V, vResult, 0x10);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_set_ss(y);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
+    return vResult;
+#endif
+}
+// Sets the Z component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            V.vector4_f32[0],
+            V.vector4_f32[1],
+            z,
+            V.vector4_f32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(z, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(z);
+    vResult = _mm_insert_ps(V, vResult, 0x20);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult =
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w, V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps(V, vResult, 0x30); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float* f, size_t i) noexcept +{ + assert(f != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORF32 U; + U.v = V; + U.f[i] = *f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float* x) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + *x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V, vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float* y) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + *y, + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y, V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float* z) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + *z, + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z, V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = 
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(z);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to a floating point value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float* w) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            V.vector4_f32[0],
+            V.vector4_f32[1],
+            V.vector4_f32[2],
+            *w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_f32(w, V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(w);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = x;
+    return tmp;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            x,
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(x, vreinterpretq_u32_f32(V), 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(x));
+    XMVECTOR vResult = _mm_move_ss(V, _mm_castsi128_ps(vTemp));
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            y,
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(y, vreinterpretq_u32_f32(V), 1));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128(V);
+    vResult = _mm_insert_epi32(vResult, static_cast<int>(y), 1);
+    return _mm_castsi128_ps(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(y));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
+    return vResult;
+#endif
+}
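+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// The lane-wise integer setters are typically used to patch select/compare
+// masks; here lane 0 is forced on and lane 1 forced off. The function name is
+// hypothetical.
+inline XMVECTOR XM_CALLCONV XMExampleForceXOnYOff(FXMVECTOR Mask) noexcept
+{
+    XMVECTOR m = XMVectorSetIntX(Mask, 0xFFFFFFFFU); // lane 0: all bits set
+    return XMVectorSetIntY(m, 0);                    // lane 1: all bits clear
+}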
+
+// Sets the Z component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            z,
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(z, vreinterpretq_u32_f32(V), 2));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128(V);
+    vResult = _mm_insert_epi32(vResult, static_cast<int>(z), 2);
+    return _mm_castsi128_ps(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(w, vreinterpretq_u32_f32(V), 3));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128(V);
+    vResult = _mm_insert_epi32(vResult, static_cast<int>(w), 3);
+    return _mm_castsi128_ps(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            *x,
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(x));
+    XMVECTOR vResult = _mm_move_ss(V, vTemp);
+    return vResult;
+#endif
+}
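+
+// --- Editor's usage sketch (not part of upstream DirectXMath) ----------------
+// The *Ptr setters load one lane directly from memory; XMVectorSetIntXPtr
+// replaces lane 0 with *pBits without touching the other lanes. The function
+// name is hypothetical.
+inline XMVECTOR XM_CALLCONV XMExampleLoadLane0Bits(FXMVECTOR V, const uint32_t* pBits) noexcept
+{
+    return XMVectorSetIntXPtr(V, pBits);
+}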
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            *x,
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(x));
+    XMVECTOR vResult = _mm_move_ss(V, vTemp);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t* y) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            *y,
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(y));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t* z) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            *z,
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t* w) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            *w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle
+(
+    FXMVECTOR V,
+    uint32_t E0,
+    uint32_t E1,
+    uint32_t E2,
+    uint32_t E3
+) noexcept
+{
+    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+    _Analysis_assume_((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            V.vector4_f32[E0],
+            V.vector4_f32[E1],
+            V.vector4_f32[E2],
+            V.vector4_f32[E3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const uint32_t ControlElement[4] =
+    {
+        0x03020100, // XM_SWIZZLE_X
+        0x07060504, // XM_SWIZZLE_Y
+        0x0B0A0908, // XM_SWIZZLE_Z
+        0x0F0E0D0C, // XM_SWIZZLE_W
+    };
+
+    uint8x8x2_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32));
+    const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32));
+    const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_)
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+    return _mm_permutevar_ps(V, vControl);
+#else
+    auto aPtr = reinterpret_cast<const uint32_t*>(&V);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    pWork[0] = aPtr[E0];
+    pWork[1] = aPtr[E1];
+    pWork[2] = aPtr[E2];
+    pWork[3] = aPtr[E3];
+
+    return Result;
+#endif
+}
+
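+// [Editorial note, not part of upstream DirectXMath] Usage sketch for the
+// runtime swizzle above; the four indices select components of V:
+//
+//     XMVECTOR v = XMVectorSet(1.f, 2.f, 3.f, 4.f);
+//     XMVECTOR r = XMVectorSwizzle(v, 3, 2, 1, 0); // r = (4, 3, 2, 1)
+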
+//------------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV XMVectorPermute
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    uint32_t PermuteX,
+    uint32_t PermuteY,
+    uint32_t PermuteZ,
+    uint32_t PermuteW
+) noexcept
+{
+    assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+    _Analysis_assume_(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const uint32_t ControlElement[8] =
+    {
+        0x03020100, // XM_PERMUTE_0X
+        0x07060504, // XM_PERMUTE_0Y
+        0x0B0A0908, // XM_PERMUTE_0Z
+        0x0F0E0D0C, // XM_PERMUTE_0W
+        0x13121110, // XM_PERMUTE_1X
+        0x17161514, // XM_PERMUTE_1Y
+        0x1B1A1918, // XM_PERMUTE_1Z
+        0x1F1E1D1C, // XM_PERMUTE_1W
+    };
+
+    uint8x8x4_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V1));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V1));
+    tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(V2));
+    tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(V2));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteX]) | (static_cast<uint64_t>(ControlElement[PermuteY]) << 32));
+    const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteZ]) | (static_cast<uint64_t>(ControlElement[PermuteW]) << 32));
+    const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
+
+    XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+
+    __m128i vSelect = _mm_cmpgt_epi32(vControl, three);
+    vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
+
+    __m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
+    __m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
+
+    __m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
+    __m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
+
+    return _mm_or_ps(masked1, masked2);
+#else
+
+    const uint32_t* aPtr[2];
+    aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
+    aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    const uint32_t i0 = PermuteX & 3;
+    const uint32_t vi0 = PermuteX >> 2;
+    pWork[0] = aPtr[vi0][i0];
+
+    const uint32_t i1 = PermuteY & 3;
+    const uint32_t vi1 = PermuteY >> 2;
+    pWork[1] = aPtr[vi1][i1];
+
+    const uint32_t i2 = PermuteZ & 3;
+    const uint32_t vi2 = PermuteZ >> 2;
+    pWork[2] = aPtr[vi2][i2];
+
+    const uint32_t i3 = PermuteW & 3;
+    const uint32_t vi3 = PermuteW >> 2;
+    pWork[3] = aPtr[vi3][i3];
+
+    return Result;
+#endif
+}
+
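+// [Editorial note, not part of upstream DirectXMath] For XMVectorPermute,
+// indices 0-3 select from V1 and 4-7 select from V2, e.g.:
+//
+//     XMVECTOR a = XMVectorSet(1.f, 2.f, 3.f, 4.f);
+//     XMVECTOR b = XMVectorSet(5.f, 6.f, 7.f, 8.f);
+//     XMVECTOR r = XMVectorPermute(a, b, 0, 4, 1, 5); // r = (1, 5, 2, 6)
+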
+//------------------------------------------------------------------------------
+// Define a control vector to be used in XMVectorSelect
+// operations. The four integers specified in XMVectorSelectControl
+// serve as indices to select between components in two vectors.
+// The first index controls selection for the first component of
+// the vectors involved in a select operation, the second index
+// controls selection for the second component etc. A value of
+// zero for an index causes the corresponding component from the first
+// vector to be selected whereas a one causes the component from the
+// second vector to be selected instead.
+
+inline XMVECTOR XM_CALLCONV XMVectorSelectControl
+(
+    uint32_t VectorIndex0,
+    uint32_t VectorIndex1,
+    uint32_t VectorIndex2,
+    uint32_t VectorIndex3
+) noexcept
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    // x=Index0,y=Index1,z=Index2,w=Index3
+    __m128i vTemp = _mm_set_epi32(static_cast<int>(VectorIndex3), static_cast<int>(VectorIndex2), static_cast<int>(VectorIndex1), static_cast<int>(VectorIndex0));
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    vTemp = _mm_cmpgt_epi32(vTemp, g_XMZero);
+    return _mm_castsi128_ps(vTemp);
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    int32x2_t V0 = vcreate_s32(static_cast<uint64_t>(VectorIndex0) | (static_cast<uint64_t>(VectorIndex1) << 32));
+    int32x2_t V1 = vcreate_s32(static_cast<uint64_t>(VectorIndex2) | (static_cast<uint64_t>(VectorIndex3) << 32));
+    int32x4_t vTemp = vcombine_s32(V0, V1);
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    return vreinterpretq_f32_u32(vcgtq_s32(vTemp, g_XMZero));
+#else
+    XMVECTOR ControlVector;
+    const uint32_t ControlElement[] =
+    {
+        XM_SELECT_0,
+        XM_SELECT_1
+    };
+
+    assert(VectorIndex0 < 2);
+    assert(VectorIndex1 < 2);
+    assert(VectorIndex2 < 2);
+    assert(VectorIndex3 < 2);
+    _Analysis_assume_(VectorIndex0 < 2);
+    _Analysis_assume_(VectorIndex1 < 2);
+    _Analysis_assume_(VectorIndex2 < 2);
+    _Analysis_assume_(VectorIndex3 < 2);
+
+    ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
+    ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
+    ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
+    ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
+
+    return ControlVector;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSelect
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Control
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]),
+            (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]),
+            (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]),
+            (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]),
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vbslq_f32(vreinterpretq_u32_f32(Control), V2, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_andnot_ps(Control, V1);
+    XMVECTOR vTemp2 = _mm_and_ps(V2, Control);
+    return _mm_or_ps(vTemp1, vTemp2);
+#endif
+}
+
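+// [Editorial note, not part of upstream DirectXMath] Select sketch: a zero
+// index keeps the component of the first vector, a one takes the second:
+//
+//     XMVECTOR c = XMVectorSelectControl(0, 1, 0, 1);
+//     XMVECTOR r = XMVectorSelect(a, b, c); // r = (a.x, b.y, a.z, b.w)
+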
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeXY
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0],
+            V2.vector4_u32[0],
+            V1.vector4_u32[1],
+            V2.vector4_u32[1],
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_unpacklo_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeZW
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[2],
+            V2.vector4_u32[2],
+            V1.vector4_u32[3],
+            V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_unpackhi_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept
+{
+    assert(Elements < 4);
+    _Analysis_assume_(Elements < 4);
+    return XMVectorPermute(V1, V2, Elements, ((Elements)+1), ((Elements)+2), ((Elements)+3));
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept
+{
+    assert(Elements < 4);
+    _Analysis_assume_(Elements < 4);
+    return XMVectorSwizzle(V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept
+{
+    assert(Elements < 4);
+    _Analysis_assume_(Elements < 4);
+    return XMVectorSwizzle(V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorInsert(
+    FXMVECTOR VD, FXMVECTOR VS,
+    uint32_t VSLeftRotateElements,
+    uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept
+{
+    XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1);
+    return XMVectorSelect(VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control);
+}
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vceqq_f32(V1, V2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpeq_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorEqualR
+(
+    uint32_t* pCR,
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    assert(pCR != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+    uint32_t CR = 0;
+    if (ux & uy & uz & uw)
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!(ux | uy | uz | uw))
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+
+    XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
+    return Control;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vreinterpret_u8_u32(vget_low_u32(vResult)), vreinterpret_u8_u32(vget_high_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vreinterpretq_f32_u32(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    uint32_t CR = 0;
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest == 0xf)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vTemp;
+#endif
+}
+
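+// [Editorial note, not part of upstream DirectXMath] The *R variants also
+// return a CR6-style summary mask, usable with the XMComparison* helpers:
+//
+//     uint32_t cr;
+//     XMVectorEqualR(&cr, a, b);
+//     if (XMComparisonAllTrue(cr)) { /* all four components matched */ }
+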
+//------------------------------------------------------------------------------
+// Treat the components of the vectors as unsigned integers and
+// compare individual bits between the two. This is useful for
+// comparing control vectors and result vectors returned from
+// other comparison operations.
+
+inline XMVECTOR XM_CALLCONV XMVectorEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0,
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vceqq_s32(vreinterpretq_s32_f32(V1), vreinterpretq_s32_f32(V2)));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorEqualIntR
+(
+    uint32_t* pCR,
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    assert(pCR != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Control = XMVectorEqualInt(V1, V2);
+
+    *pCR = 0;
+    if (XMVector4EqualInt(Control, XMVectorTrueInt()))
+    {
+        // All elements are equal
+        *pCR |= XM_CRMASK_CR6TRUE;
+    }
+    else if (XMVector4EqualInt(Control, XMVectorFalseInt()))
+    {
+        // All elements are not equal
+        *pCR |= XM_CRMASK_CR6FALSE;
+    }
+    return Control;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vreinterpretq_f32_u32(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V));
+    uint32_t CR = 0;
+    if (iTemp == 0x0F)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTemp)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorNearEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Epsilon
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    float fDeltax = V1.vector4_f32[0] - V2.vector4_f32[0];
+    float fDeltay = V1.vector4_f32[1] - V2.vector4_f32[1];
+    float fDeltaz = V1.vector4_f32[2] - V2.vector4_f32[2];
+    float fDeltaw = V1.vector4_f32[3] - V2.vector4_f32[3];
+
+    fDeltax = fabsf(fDeltax);
+    fDeltay = fabsf(fDeltay);
+    fDeltaz = fabsf(fDeltaz);
+    fDeltaw = fabsf(fDeltaw);
+
+    XMVECTORU32 Control = { { {
+            (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
+            (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
+            (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
+            (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0,
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vDelta = vsubq_f32(V1, V2);
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
+    return vacleq_f32(vDelta, Epsilon);
+#else
+    return vreinterpretq_f32_u32(vcleq_f32(vabsq_f32(vDelta), Epsilon));
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Get the difference
+    XMVECTOR vDelta = _mm_sub_ps(V1, V2);
+    // Get the absolute value of the difference
+    XMVECTOR vTemp = _mm_setzero_ps();
+    vTemp = _mm_sub_ps(vTemp, vDelta);
+    vTemp = _mm_max_ps(vTemp, vDelta);
+    vTemp = _mm_cmple_ps(vTemp, Epsilon);
+    return vTemp;
+#endif
+}
+
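+// [Editorial note, not part of upstream DirectXMath] NearEqual takes a
+// per-component epsilon; XMVectorReplicate gives a uniform tolerance:
+//
+//     XMVECTOR close = XMVectorNearEqual(a, b, XMVectorReplicate(1e-4f));
+//     // each lane is 0xFFFFFFFF where |a - b| <= 1e-4f
+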
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorNotEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(V1, V2)));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpneq_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0,
+            (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0,
+            (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0,
+            (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vmvnq_u32(
+        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    return _mm_xor_ps(_mm_castsi128_ps(V), g_XMNegOneMask);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorGreater
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vcgtq_f32(V1, V2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpgt_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorGreaterR
+(
+    uint32_t* pCR,
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    assert(pCR != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+    uint32_t CR = 0;
+    if (ux & uy & uz & uw)
+    {
+        // All elements are greater
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!(ux | uy | uz | uw))
+    {
+        // All elements are not greater
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+
+    XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgtq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        // All elements are greater
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        // All elements are not greater
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vreinterpretq_f32_u32(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
+    uint32_t CR = 0;
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest == 0xf)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        // All elements are not greater
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vcgeq_f32(V1, V2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpge_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR
+(
+    uint32_t* pCR,
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    assert(pCR != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+    uint32_t CR = 0;
+    if (ux & uy & uz & uw)
+    {
+        // All elements are greater or equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!(ux | uy | uz | uw))
+    {
+        // All elements are not greater or equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+
+    XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgeq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        // All elements are greater or equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        // All elements are not greater or equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vreinterpretq_f32_u32(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
+    uint32_t CR = 0;
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest == 0xf)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        // All elements are not greater or equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorLess
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vcltq_f32(V1, V2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmplt_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vcleq_f32(V1, V2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmple_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorInBounds
+(
+    FXMVECTOR V,
+    FXMVECTOR Bounds
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Test if less than or equal
+    uint32x4_t vTemp1 = vcleq_f32(V, Bounds);
+    // Negate the bounds
+    uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds));
+    // Test if greater or equal (Reversed)
+    vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V);
+    // Blend answers
+    vTemp1 = vandq_u32(vTemp1, vTemp2);
+    return vreinterpretq_f32_u32(vTemp1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Test if less than or equal
+    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
+    // Negate the bounds
+    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
+    // Test if greater or equal (Reversed)
+    vTemp2 = _mm_cmple_ps(vTemp2, V);
+    // Blend answers
+    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
+    return vTemp1;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorInBoundsR
+(
+    uint32_t* pCR,
+    FXMVECTOR V,
+    FXMVECTOR Bounds
+) noexcept
+{
+    assert(pCR != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+
+    uint32_t CR = 0;
+    if (ux & uy & uz & uw)
+    {
+        // All elements are in bounds
+        CR = XM_CRMASK_CR6BOUNDS;
+    }
+    *pCR = CR;
+
+    XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Test if less than or equal
+    uint32x4_t vTemp1 = vcleq_f32(V, Bounds);
+    // Negate the bounds
+    uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds));
+    // Test if greater or equal (Reversed)
+    vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V);
+    // Blend answers
+    vTemp1 = vandq_u32(vTemp1, vTemp2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTemp1)), vget_high_u8(vreinterpretq_u8_u32(vTemp1)));
+    uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1);
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        // All elements are in bounds
+        CR = XM_CRMASK_CR6BOUNDS;
+    }
+    *pCR = CR;
+    return vreinterpretq_f32_u32(vTemp1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Test if less than or equal
+    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
+    // Negate the bounds
+    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
+    // Test if greater or equal (Reversed)
+    vTemp2 = _mm_cmple_ps(vTemp2, V);
+    // Blend answers
+    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
+
+    uint32_t CR = 0;
+    if (_mm_movemask_ps(vTemp1) == 0xf)
+    {
+        // All elements are in bounds
+        CR = XM_CRMASK_CR6BOUNDS;
+    }
+    *pCR = CR;
+    return vTemp1;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+#endif
+
+inline XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
+            XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
+            XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
+            XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
+    XMVECTORU32 vResult = { { {
+            isnan(vgetq_lane_f32(V, 0)) ? 0xFFFFFFFFU : 0,
+            isnan(vgetq_lane_f32(V, 1)) ? 0xFFFFFFFFU : 0,
+            isnan(vgetq_lane_f32(V, 2)) ? 0xFFFFFFFFU : 0,
+            isnan(vgetq_lane_f32(V, 3)) ? 0xFFFFFFFFU : 0 } } };
+    return vResult.v;
+#else
+    // Test against itself. NaN is always not equal
+    uint32x4_t vTempNan = vceqq_f32(V, V);
+    // Flip results
+    return vreinterpretq_f32_u32(vmvnq_u32(vTempNan));
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
+    XM_ALIGNED_DATA(16) float tmp[4];
+    _mm_store_ps(tmp, V);
+    XMVECTORU32 vResult = { { {
+            isnan(tmp[0]) ? 0xFFFFFFFFU : 0,
+            isnan(tmp[1]) ? 0xFFFFFFFFU : 0,
+            isnan(tmp[2]) ? 0xFFFFFFFFU : 0,
+            isnan(tmp[3]) ? 0xFFFFFFFFU : 0 } } };
+    return vResult.v;
+#else
+    // Test against itself. NaN is always not equal
+    return _mm_cmpneq_ps(V, V);
+#endif
+#endif
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma float_control(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
+            XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
+            XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
+            XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Mask off the sign bit
+    uint32x4_t vTemp = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
+    // Compare to infinity
+    vTemp = vceqq_f32(vreinterpretq_f32_u32(vTemp), g_XMInfinity);
+    // If any are infinity, the signs are true.
+    return vreinterpretq_f32_u32(vTemp);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Mask off the sign bit
+    __m128 vTemp = _mm_and_ps(V, g_XMAbsMask);
+    // Compare to infinity
+    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
+    // If any are infinity, the signs are true.
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Rounding and clamping operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMin
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0],
+            (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1],
+            (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2],
+            (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vminq_f32(V1, V2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_min_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMax
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0],
+            (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1],
+            (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2],
+            (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vmaxq_f32(V1, V2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_max_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+namespace MathInternal
+{
+    // Round to nearest (even) a.k.a. banker's rounding
+    inline float round_to_nearest(float x) noexcept
+    {
+        float i = floorf(x);
+        x -= i;
+        if (x < 0.5f)
+            return i;
+        if (x > 0.5f)
+            return i + 1.f;
+
+        float int_part;
+        (void)modff(i / 2.f, &int_part);
+        if ((2.f * int_part) == i)
+        {
+            return i;
+        }
+
+        return i + 1.f;
+    }
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+#endif
+
+inline XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            MathInternal::round_to_nearest(V.vector4_f32[0]),
+            MathInternal::round_to_nearest(V.vector4_f32[1]),
+            MathInternal::round_to_nearest(V.vector4_f32[2]),
+            MathInternal::round_to_nearest(V.vector4_f32[3])
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vrndnq_f32(V);
+#else
+    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(V), g_XMNegativeZero);
+    float32x4_t sMagic = vreinterpretq_f32_u32(vorrq_u32(g_XMNoFraction, sign));
+    float32x4_t R1 = vaddq_f32(V, sMagic);
+    R1 = vsubq_f32(R1, sMagic);
+    float32x4_t R2 = vabsq_f32(V);
+    uint32x4_t mask = vcleq_f32(R2, g_XMNoFraction);
+    return vbslq_f32(mask, R1, V);
+#endif
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 sign = _mm_and_ps(V, g_XMNegativeZero);
+    __m128 sMagic = _mm_or_ps(g_XMNoFraction, sign);
+    __m128 R1 = _mm_add_ps(V, sMagic);
+    R1 = _mm_sub_ps(R1, sMagic);
+    __m128 R2 = _mm_and_ps(V, g_XMAbsMask);
+    __m128 mask = _mm_cmple_ps(R2, g_XMNoFraction);
+    R2 = _mm_andnot_ps(mask, V);
+    R1 = _mm_and_ps(R1, mask);
+    XMVECTOR vResult = _mm_xor_ps(R1, R2);
+    return vResult;
+#endif
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma float_control(pop)
+#endif
+
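+// [Editorial note, not part of upstream DirectXMath] round_to_nearest is
+// round-half-to-even, so XMVectorRound maps
+//     (0.5, 1.5, 2.5, -0.5) -> (0, 2, 2, -0)
+// whereas roundf() (half away from zero) would give (1, 2, 3, -1).
+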
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR Result;
+    uint32_t i;
+
+    // Avoid C4701
+    Result.vector4_f32[0] = 0.0f;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (XMISNAN(V.vector4_f32[i]))
+        {
+            Result.vector4_u32[i] = 0x7FC00000;
+        }
+        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
+        {
+            Result.vector4_f32[i] = static_cast<float>(static_cast<int32_t>(V.vector4_f32[i]));
+        }
+        else
+        {
+            Result.vector4_f32[i] = V.vector4_f32[i];
+        }
+    }
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vrndq_f32(V);
+#else
+    float32x4_t vTest = vabsq_f32(V);
+    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
+
+    int32x4_t vInt = vcvtq_s32_f32(V);
+    float32x4_t vResult = vcvtq_f32_s32(vInt);
+
+    // All numbers less than 8388608 will use the round to int
+    // All others, use the ORIGINAL value
+    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
+#endif
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // To handle NAN, INF and numbers greater than 8388608, use masking
+    // Get the abs value
+    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF)
+    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
+    // Convert to int and back to float for rounding with truncation
+    __m128i vInt = _mm_cvttps_epi32(V);
+    // Convert back to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
+    // All numbers less than 8388608 will use the round to int
+    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
+    // All others, use the ORIGINAL value
+    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
+    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
+    return vResult;
+#endif
+}
+
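+// [Editorial note, not part of upstream DirectXMath] 8388608.0f is 2^23:
+// every float at or above it is already an integer, so those lanes (and
+// NaN/INF, whose absolute bit patterns compare even higher) keep the
+// original value instead of the int32 round-trip, which would overflow.
+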
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            floorf(V.vector4_f32[0]),
+            floorf(V.vector4_f32[1]),
+            floorf(V.vector4_f32[2]),
+            floorf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vrndmq_f32(V);
+#else
+    float32x4_t vTest = vabsq_f32(V);
+    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
+    // Truncate
+    int32x4_t vInt = vcvtq_s32_f32(V);
+    float32x4_t vResult = vcvtq_f32_s32(vInt);
+    uint32x4_t vLargerMask = vcgtq_f32(vResult, V);
+    // 0 -> 0, 0xffffffff -> -1.0f
+    float32x4_t vLarger = vcvtq_f32_s32(vreinterpretq_s32_u32(vLargerMask));
+    vResult = vaddq_f32(vResult, vLarger);
+    // All numbers less than 8388608 will use the round to int
+    // All others, use the ORIGINAL value
+    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
+#endif
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_floor_ps(V);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // To handle NAN, INF and numbers greater than 8388608, use masking
+    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
+    // Truncate
+    __m128i vInt = _mm_cvttps_epi32(V);
+    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
+    __m128 vLarger = _mm_cmpgt_ps(vResult, V);
+    // 0 -> 0, 0xffffffff -> -1.0f
+    vLarger = _mm_cvtepi32_ps(_mm_castps_si128(vLarger));
+    vResult = _mm_add_ps(vResult, vLarger);
+    // All numbers less than 8388608 will use the round to int
+    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
+    // All others, use the ORIGINAL value
+    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
+    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            ceilf(V.vector4_f32[0]),
+            ceilf(V.vector4_f32[1]),
+            ceilf(V.vector4_f32[2]),
+            ceilf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vrndpq_f32(V);
+#else
+    float32x4_t vTest = vabsq_f32(V);
+    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
+    // Truncate
+    int32x4_t vInt = vcvtq_s32_f32(V);
+    float32x4_t vResult = vcvtq_f32_s32(vInt);
+    uint32x4_t vSmallerMask = vcltq_f32(vResult, V);
+    // 0 -> 0, 0xffffffff -> -1.0f
+    float32x4_t vSmaller = vcvtq_f32_s32(vreinterpretq_s32_u32(vSmallerMask));
+    vResult = vsubq_f32(vResult, vSmaller);
+    // All numbers less than 8388608 will use the round to int
+    // All others, use the ORIGINAL value
+    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
+#endif
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_ceil_ps(V);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // To handle NAN, INF and numbers greater than 8388608, use masking
+    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
+    // Truncate
+    __m128i vInt = _mm_cvttps_epi32(V);
+    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
+    __m128 vSmaller = _mm_cmplt_ps(vResult, V);
+    // 0 -> 0, 0xffffffff -> -1.0f
+    vSmaller = _mm_cvtepi32_ps(_mm_castps_si128(vSmaller));
+    vResult = _mm_sub_ps(vResult, vSmaller);
+    // All numbers less than 8388608 will use the round to int
+    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
+    // All others, use the ORIGINAL value
+    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
+    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorClamp
+(
+    FXMVECTOR V,
+    FXMVECTOR Min,
+    FXMVECTOR Max
+) noexcept
+{
+    assert(XMVector4LessOrEqual(Min, Max));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVectorMax(Min, V);
+    Result = XMVectorMin(Max, Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32(Min, V);
+    vResult = vminq_f32(Max, vResult);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult;
+    vResult = _mm_max_ps(Min, V);
+    vResult = _mm_min_ps(Max, vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    const XMVECTOR Zero = XMVectorZero();
+
+    return XMVectorClamp(V, Zero, g_XMOne.v);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Set <0 to 0
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
+    // Set >1 to 1
+    return vminq_f32(vResult, vdupq_n_f32(1.0f));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Set <0 to 0
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    // Set >1 to 1
+    return _mm_min_ps(vResult, g_XMOne);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Bitwise logical operations
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorAndInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0] & V2.vector4_u32[0],
+            V1.vector4_u32[1] & V2.vector4_u32[1],
+            V1.vector4_u32[2] & V2.vector4_u32[2],
+            V1.vector4_u32[3] & V2.vector4_u32[3]
+        } } };
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_and_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorAndCInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0] & ~V2.vector4_u32[0],
+            V1.vector4_u32[1] & ~V2.vector4_u32[1],
+            V1.vector4_u32[2] & ~V2.vector4_u32[2],
+            V1.vector4_u32[3] & ~V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_andnot_si128(_mm_castps_si128(V2), _mm_castps_si128(V1));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorOrInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0] | V2.vector4_u32[0],
+            V1.vector4_u32[1] | V2.vector4_u32[1],
+            V1.vector4_u32[2] | V2.vector4_u32[2],
+            V1.vector4_u32[3] | V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorNorInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            ~(V1.vector4_u32[0] | V2.vector4_u32[0]),
+            ~(V1.vector4_u32[1] | V2.vector4_u32[1]),
+            ~(V1.vector4_u32[2] | V2.vector4_u32[2]),
+            ~(V1.vector4_u32[3] | V2.vector4_u32[3])
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t Result = vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
+    return vreinterpretq_f32_u32(vbicq_u32(g_XMNegOneMask, Result));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i Result;
+    Result = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    Result = _mm_andnot_si128(Result, g_XMNegOneMask);
+    return _mm_castsi128_ps(Result);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorXorInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0] ^ V2.vector4_u32[0],
+            V1.vector4_u32[1] ^ V2.vector4_u32[1],
+            V1.vector4_u32[2] ^ V2.vector4_u32[2],
+            V1.vector4_u32[3] ^ V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_xor_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            -V.vector4_f32[0],
+            -V.vector4_f32[1],
+            -V.vector4_f32[2],
+            -V.vector4_f32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vnegq_f32(V);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR Z;
+
+    Z = _mm_setzero_ps();
+
+    return _mm_sub_ps(Z, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            V1.vector4_f32[0] + V2.vector4_f32[0],
+            V1.vector4_f32[1] + V2.vector4_f32[1],
+            V1.vector4_f32[2] + V2.vector4_f32[2],
+            V1.vector4_f32[3] + V2.vector4_f32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vaddq_f32(V1, V2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_add_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result;
+    Result.f[0] =
+        Result.f[1] =
+        Result.f[2] =
+        Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3];
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    float32x4_t vTemp = vpaddq_f32(V, V);
+    return vpaddq_f32(vTemp, vTemp);
+#else
+    float32x2_t v1 = vget_low_f32(V);
+    float32x2_t v2 = vget_high_f32(V);
+    v1 = vadd_f32(v1, v2);
+    v1 = vpadd_f32(v1, v1);
+    return vcombine_f32(v1, v1);
+#endif
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vTemp = _mm_hadd_ps(V, V);
+    return _mm_hadd_ps(vTemp, vTemp);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
+    XMVECTOR vTemp2 = _mm_add_ps(V, vTemp);
+    vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2));
+    return _mm_add_ps(vTemp, vTemp2);
+#endif
+}
+
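+// [Editorial note, not part of upstream DirectXMath] The pre-SSE3 path of
+// XMVectorSum reduces with two shuffle+add rounds:
+//     (x,y,z,w) + (y,x,w,z) -> (x+y, x+y, z+w, z+w)
+// and adding the swapped halves of that leaves x+y+z+w in every lane.
+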
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorAddAngles
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    const XMVECTOR Zero = XMVectorZero();
+
+    // Add the given angles together. If the range of V1 is such
+    // that -Pi <= V1 < Pi and the range of V2 is such that
+    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
+    // will be -Pi <= Result < Pi.
+    XMVECTOR Result = XMVectorAdd(V1, V2);
+
+    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
+    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
+
+    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
+    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
+
+    Result = XMVectorAdd(Result, Offset);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Adjust the angles
+    float32x4_t vResult = vaddq_f32(V1, V2);
+    // Less than Pi?
+    uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi);
+    vOffset = vandq_u32(vOffset, g_XMTwoPi);
+    // Add 2Pi to all entries less than -Pi
+    vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset));
+    // Greater than or equal to Pi?
+    vOffset = vcgeq_f32(vResult, g_XMPi);
+    vOffset = vandq_u32(vOffset, g_XMTwoPi);
+    // Subtract 2Pi from all entries greater than Pi
+    vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset));
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Adjust the angles
+    XMVECTOR vResult = _mm_add_ps(V1, V2);
+    // Less than Pi?
+    XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi);
+    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
+    // Add 2Pi to all entries less than -Pi
+    vResult = _mm_add_ps(vResult, vOffset);
+    // Greater than or equal to Pi?
+    vOffset = _mm_cmpge_ps(vResult, g_XMPi);
+    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
+    // Subtract 2Pi from all entries greater than Pi
+    vResult = _mm_sub_ps(vResult, vOffset);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            V1.vector4_f32[0] - V2.vector4_f32[0],
+            V1.vector4_f32[1] - V2.vector4_f32[1],
+            V1.vector4_f32[2] - V2.vector4_f32[2],
+            V1.vector4_f32[3] - V2.vector4_f32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsubq_f32(V1, V2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_sub_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    const XMVECTOR Zero = XMVectorZero();
+
+    // Subtract the given angles. If the range of V1 is such
+    // that -Pi <= V1 < Pi and the range of V2 is such that
+    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
+    // will be -Pi <= Result < Pi.
+    XMVECTOR Result = XMVectorSubtract(V1, V2);
+
+    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
+    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
+
+    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
+    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
+
+    Result = XMVectorAdd(Result, Offset);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Adjust the angles
+    XMVECTOR vResult = vsubq_f32(V1, V2);
+    // Less than Pi?
+    uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi);
+    vOffset = vandq_u32(vOffset, g_XMTwoPi);
+    // Add 2Pi to all entries less than -Pi
+    vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset));
+    // Greater than or equal to Pi?
+    vOffset = vcgeq_f32(vResult, g_XMPi);
+    vOffset = vandq_u32(vOffset, g_XMTwoPi);
+    // Subtract 2Pi from all entries greater than Pi
+    vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset));
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Adjust the angles
+    XMVECTOR vResult = _mm_sub_ps(V1, V2);
+    // Less than Pi?
+    XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi);
+    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
+    // Add 2Pi to all entries less than -Pi
+    vResult = _mm_add_ps(vResult, vOffset);
+    // Greater than or equal to Pi?
+    vOffset = _mm_cmpge_ps(vResult, g_XMPi);
+    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
+    // Subtract 2Pi from all entries greater than Pi
+    vResult = _mm_sub_ps(vResult, vOffset);
+    return vResult;
+#endif
+}
+
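+// [Editorial note, not part of upstream DirectXMath] Wrapping sketch for the
+// angle helpers above, using the standard XM_PI constants:
+//
+//     XMVECTOR r = XMVectorAddAngles(XMVectorReplicate(XM_PIDIV2),
+//                                    XMVectorReplicate(XM_PI));
+//     // 3*Pi/2 >= Pi, so each lane wraps to -Pi/2
+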
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiply
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            V1.vector4_f32[0] * V2.vector4_f32[0],
+            V1.vector4_f32[1] * V2.vector4_f32[1],
+            V1.vector4_f32[2] * V2.vector4_f32[2],
+            V1.vector4_f32[3] * V2.vector4_f32[3]
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vmulq_f32(V1, V2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_mul_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0],
+            V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1],
+            V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2],
+            V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vfmaq_f32(V3, V1, V2);
+#else
+    return vmlaq_f32(V3, V1, V2);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    return XM_FMADD_PS(V1, V2, V3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorDivide
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            V1.vector4_f32[0] / V2.vector4_f32[0],
+            V1.vector4_f32[1] / V2.vector4_f32[1],
+            V1.vector4_f32[2] / V2.vector4_f32[2],
+            V1.vector4_f32[3] / V2.vector4_f32[3]
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vdivq_f32(V1, V2);
+#else
+    // 2 iterations of Newton-Raphson refinement of reciprocal
+    float32x4_t Reciprocal = vrecpeq_f32(V2);
+    float32x4_t S = vrecpsq_f32(Reciprocal, V2);
+    Reciprocal = vmulq_f32(S, Reciprocal);
+    S = vrecpsq_f32(Reciprocal, V2);
+    Reciprocal = vmulq_f32(S, Reciprocal);
+    return vmulq_f32(V1, Reciprocal);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_div_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
+            V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
+            V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
+            V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])
+        } } };
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    return vfmsq_f32(V3, V1, V2);
+#else
+    return vmlsq_f32(V3, V1, V2);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    return XM_FNMADD_PS(V1, V2, V3);
+#endif
+}
+
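+// [Editorial note, not part of upstream DirectXMath] XMVectorMultiplyAdd
+// computes a * b + c per lane. On FMA3-enabled builds XM_FMADD_PS lowers to
+// a fused multiply-add, so the product is not rounded before the addition
+// and results can differ from _mm_add_ps(_mm_mul_ps(a, b), c) in the last ulp.
+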
XM_CALLCONV XMVectorScale
+(
+ FXMVECTOR V,
+ float ScaleFactor
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ V.vector4_f32[0] * ScaleFactor,
+ V.vector4_f32[1] * ScaleFactor,
+ V.vector4_f32[2] * ScaleFactor,
+ V.vector4_f32[3] * ScaleFactor
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ return vmulq_n_f32(V, ScaleFactor);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_set_ps1(ScaleFactor);
+ return _mm_mul_ps(vResult, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ 1.f / V.vector4_f32[0],
+ 1.f / V.vector4_f32[1],
+ 1.f / V.vector4_f32[2],
+ 1.f / V.vector4_f32[3]
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ return vrecpeq_f32(V);
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_rcp_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ 1.f / V.vector4_f32[0],
+ 1.f / V.vector4_f32[1],
+ 1.f / V.vector4_f32[2],
+ 1.f / V.vector4_f32[3]
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+ float32x4_t one = vdupq_n_f32(1.0f);
+ return vdivq_f32(one, V);
+#else
+ // 2 iterations of Newton-Raphson refinement
+ float32x4_t Reciprocal = vrecpeq_f32(V);
+ float32x4_t S = vrecpsq_f32(Reciprocal, V);
+ Reciprocal = vmulq_f32(S, Reciprocal);
+ S = vrecpsq_f32(Reciprocal, V);
+ return vmulq_f32(S, Reciprocal);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_div_ps(g_XMOne, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return an estimated square root
+inline XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ sqrtf(V.vector4_f32[0]),
+ sqrtf(V.vector4_f32[1]),
+ sqrtf(V.vector4_f32[2]),
+ sqrtf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ // 1 iteration of Newton-Raphson refinement of sqrt
+ float32x4_t S0 = vrsqrteq_f32(V);
+ float32x4_t P0 = vmulq_f32(V, S0);
+ float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+ float32x4_t S1 = vmulq_f32(S0, R0);
+
+ XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
+ XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0));
+ XMVECTOR Result = vmulq_f32(V, S1);
+ XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
+ return XMVectorSelect(V, Result, Select);
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_sqrt_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ sqrtf(V.vector4_f32[0]),
+ sqrtf(V.vector4_f32[1]),
+ sqrtf(V.vector4_f32[2]),
+ sqrtf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ // 3 iterations of Newton-Raphson refinement of sqrt
+ float32x4_t S0 = vrsqrteq_f32(V);
+ float32x4_t P0 = vmulq_f32(V, S0);
+ float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+ float32x4_t S1 = vmulq_f32(S0, R0);
+ float32x4_t P1 = vmulq_f32(V, S1);
+ float32x4_t R1 = vrsqrtsq_f32(P1, S1);
+ float32x4_t S2 = vmulq_f32(S1, R1);
+ float32x4_t P2 = vmulq_f32(V, S2);
+ float32x4_t R2 = vrsqrtsq_f32(P2, S2);
+ float32x4_t S3 = vmulq_f32(S2, R2);
+
+ XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
+ XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0));
+ XMVECTOR Result = vmulq_f32(V, S3);
+ XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
+ return XMVectorSelect(V, Result, Select);
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_sqrt_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ 1.f / sqrtf(V.vector4_f32[0]),
+ 1.f / sqrtf(V.vector4_f32[1]),
+ 1.f / sqrtf(V.vector4_f32[2]),
+ 1.f / sqrtf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ return vrsqrteq_f32(V);
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_rsqrt_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ 1.f / sqrtf(V.vector4_f32[0]),
+ 1.f / sqrtf(V.vector4_f32[1]),
+ 1.f / sqrtf(V.vector4_f32[2]),
+ 1.f / sqrtf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ // 2 iterations of Newton-Raphson refinement of reciprocal square root
+ float32x4_t S0 = vrsqrteq_f32(V);
+
+ float32x4_t P0 = vmulq_f32(V, S0);
+ float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+
+ float32x4_t S1 = vmulq_f32(S0, R0);
+ float32x4_t P1 = vmulq_f32(V, S1);
+ float32x4_t R1 = vrsqrtsq_f32(P1, S1);
+
+ return vmulq_f32(S1, R1);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_sqrt_ps(V);
+ vResult = _mm_div_ps(g_XMOne, vResult);
+ return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ exp2f(V.vector4_f32[0]),
+ exp2f(V.vector4_f32[1]),
+ exp2f(V.vector4_f32[2]),
+ exp2f(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ int32x4_t itrunc = vcvtq_s32_f32(V);
+ float32x4_t ftrunc = vcvtq_f32_s32(itrunc);
+ float32x4_t y = vsubq_f32(V, ftrunc);
+
+ float32x4_t poly = vmlaq_f32(g_XMExpEst6, g_XMExpEst7, y);
+ poly = vmlaq_f32(g_XMExpEst5, poly, y);
+ poly = vmlaq_f32(g_XMExpEst4, poly, y);
+ poly = vmlaq_f32(g_XMExpEst3, poly, y);
+ poly = vmlaq_f32(g_XMExpEst2, poly, y);
+ poly = vmlaq_f32(g_XMExpEst1, poly, y);
+ poly = vmlaq_f32(g_XMOne, poly, y);
+
+ int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias);
+ biased = vshlq_n_s32(biased, 23);
+ float32x4_t result0 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly);
+
+ biased = vaddq_s32(itrunc, g_XM253);
+ biased = vshlq_n_s32(biased, 23);
+ float32x4_t result1 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly);
+ result1 = vmulq_f32(g_XMMinNormal.v, result1);
+
+ // Use selection to handle the cases
+ // if (V is NaN) -> QNaN;
+ // else if (V sign bit set)
+ // if (V > -150)
+ // if (V.exponent < -126) -> result1
+ // else -> result0
+ // else -> +0
+ // else
+ // if (V < 128) -> result0
+ // else -> +inf
+
+ uint32x4_t comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBin128);
+ float32x4_t result2 = vbslq_f32(comp, result0, g_XMInfinity);
+
+ comp = vcltq_s32(itrunc, 
g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32(comp, result1, result0); + + comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBinNeg150); + float32x4_t result4 = vbslq_f32(comp, result3, g_XMZero); + + int32x4_t sign = vandq_s32(vreinterpretq_s32_f32(V), g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32(comp, result4, result2); + + int32x4_t t0 = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + t0 = vreinterpretq_s32_u32(vceqq_s32(t0, g_XMZero)); + t1 = vreinterpretq_s32_u32(vceqq_s32(t1, g_XMInfinity)); + int32x4_t isNaN = vbicq_s32(t1, t0); + + float32x4_t vResult = vbslq_f32(vreinterpretq_u32_s32(isNaN), g_XMQNaN, result5); + return vResult; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + + __m128 poly = XM_FMADD_PS(g_XMExpEst7, y, g_XMExpEst6); + poly = XM_FMADD_PS(poly, y, g_XMExpEst5); + poly = XM_FMADD_PS(poly, y, g_XMExpEst4); + poly = XM_FMADD_PS(poly, y, g_XMExpEst3); + poly = XM_FMADD_PS(poly, y, g_XMExpEst2); + poly = XM_FMADD_PS(poly, y, g_XMExpEst1); + poly = XM_FMADD_PS(poly, y, g_XMOne); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + 
XMVECTORF32 Result = { { { + powf(10.0f, V.vector4_f32[0]), + powf(10.0f, V.vector4_f32[1]), + powf(10.0f, V.vector4_f32[2]), + powf(10.0f, V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp10_ps(V); + return Result; +#else + // exp10(V) = exp2(vin*log2(10)) + XMVECTOR Vten = XMVectorMultiply(g_XMLg10, V); + return XMVectorExp2(Vten); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + expf(V.vector4_f32[0]), + expf(V.vector4_f32[1]), + expf(V.vector4_f32[2]), + expf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp_ps(V); + return Result; +#else + // expE(V) = exp2(vin*log2(e)) + XMVECTOR Ve = XMVectorMultiply(g_XMLgE, V); + return XMVectorExp2(Ve); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace MathInternal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), 
_mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace MathInternal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace MathInternal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + uint32x4_t c = vcgtq_s32(value, g_XM0000FFFF); // c = (v > 0xFFFF) + int32x4_t b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32(r); + int32x4_t v = vshlq_s32(value, r); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 
1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_s32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace MathInternal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + log2f(V.vector4_f32[0]), + log2f(V.vector4_f32[1]), + log2f(V.vector4_f32[2]), + log2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(vreinterpretq_s32_f32(g_XMZero), rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = MathInternal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(vreinterpretq_s32_f32(g_XMOne), t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_f32(V, g_XMZero); + uint32x4_t isNotFinite = vcgtq_f32(V, g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for 
normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = MathInternal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = MathInternal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + log10f(V.vector4_f32[0]), + log10f(V.vector4_f32[1]), + log10f(V.vector4_f32[2]), + log10f(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and 
significand for normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = MathInternal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log10_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
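+ // (Subnormals store a biased exponent of zero and no implicit leading 1, so
+ // the significand is renormalized below: locate the leading set bit, shift
+ // the trailing bits up, and lower the exponent by the same amount.)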
+ __m128i leading = MathInternal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = MathInternal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + logf(V.vector4_f32[0]), + logf(V.vector4_f32[1]), + logf(V.vector4_f32[2]), + logf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
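+ // (For normals the biased exponent occupies bits 23..30; shifting right by
+ // 23 and subtracting the bias of 127 recovers the true base-2 exponent.)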
+ int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = MathInternal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
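+ // (MathInternal::GetLeadingBit returns a per-lane index of the highest set
+ // bit, which drives the renormalizing shift for subnormal inputs below.)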
+ __m128i leading = MathInternal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = MathInternal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept +{ + return XMVectorLog2(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + powf(V1.vector4_f32[0], V2.vector4_f32[0]), + powf(V1.vector4_f32[1], V2.vector4_f32[1]), + powf(V1.vector4_f32[2], V2.vector4_f32[2]), + powf(V1.vector4_f32[3], V2.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { { { + powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), + powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), + 
powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), + powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) + } } }; + return vResult.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_pow_ps(V1, V2); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + XM_ALIGNED_DATA(16) float a[4]; + XM_ALIGNED_DATA(16) float b[4]; + _mm_store_ps(a, V1); + _mm_store_ps(b, V2); + XMVECTOR vResult = _mm_setr_ps( + powf(a[0], b[0]), + powf(a[1], b[1]), + powf(a[2], b[2]), + powf(a[3], b[3])); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult, V); + vResult = _mm_max_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32(V1, vResult, V2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + return XM_FNMADD_PS(vResult, V2, V1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32(Angles, vResult, g_XMTwoPi); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return XM_FNMADD_PS(vResult, g_XMTwoPi, Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) 
= sin(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
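+ // (Reflection identity: cos(pi - x) = -cos(x), so arguments outside
+ // [-pi/2, pi/2] are folded back into range and the sign is restored after
+ // the polynomial via fsign.)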
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
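+ // (The reflection leaves the sine unchanged, since sin(pi - x) = sin(x);
+ // only the cosine result needs the fsign correction below.)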
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SVML_INTRINSICS_) + *pSin = _mm_sincos_ps(pCos, V); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
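+ // (Both polynomial evaluations below share this single range reduction and
+ // the common x2 term, which is why XMVectorSinCos is cheaper than separate
+ // calls to XMVectorSin and XMVectorCos.)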
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept +{ + // Cody and Waite algorithm to compute tangent. 
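+ // (The reduction computes n = round(V * 2/pi) and subtracts n * pi/2 in two
+ // steps, a high part C0 and a small correction C1, so the remainder VC keeps
+ // nearly full float precision. The tangent is then evaluated as a rational
+ // approximation N/D in VC, with the cotangent form D/-N used when n is odd.)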
+
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ tanf(V.vector4_f32[0]),
+ tanf(V.vector4_f32[1]),
+ tanf(V.vector4_f32[2]),
+ tanf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_SVML_INTRINSICS_)
+ XMVECTOR Result = _mm_tan_ps(V);
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+ static const XMVECTORF32 TanCoefficients0 = { { { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f } } };
+ static const XMVECTORF32 TanCoefficients1 = { { { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f } } };
+ static const XMVECTORF32 TanConstants = { { { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ } } };
+ static const XMVECTORU32 Mask = { { { 0x1, 0x1, 0x1, 0x1 } } };
+
+ XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);
+
+ XMVECTOR Zero = XMVectorZero();
+
+ XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
+ XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
+ XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);
+
+ XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);
+
+ VA = XMVectorRound(VA);
+
+ XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
+
+ XMVECTOR VB = XMVectorAbs(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ VB = vreinterpretq_f32_u32(vcvtq_u32_f32(VB));
+#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ reinterpret_cast<__m128i*>(&VB)[0] = _mm_cvttps_epi32(VB);
+#else
+ for (size_t i = 0; i < 4; i++)
+ {
+ VB.vector4_u32[i] = static_cast<uint32_t>(VB.vector4_f32[i]);
+ }
+#endif
+
+ XMVECTOR VC2 = XMVectorMultiply(VC, VC);
+
+ XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
+ XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
+ XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
+ XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
+ XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
+ XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
+ XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
+ XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);
+
+ XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
+ VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
+
+ XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
+ XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
+ N = XMVectorMultiplyAdd(VC2, N, T5);
+ D = XMVectorMultiplyAdd(VC2, D, T2);
+ N = XMVectorMultiply(VC2, N);
+ D = XMVectorMultiplyAdd(VC2, D, T1);
+ N = XMVectorMultiplyAdd(VC, N, VC);
+ XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
+ D = XMVectorMultiplyAdd(VC2, D, T0);
+
+ N = XMVectorSelect(N, VC, VCNearZero);
+ D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
+
+ XMVECTOR R0 = XMVectorNegate(N);
+ XMVECTOR R1 = XMVectorDivide(N, D);
+ R0 = XMVectorDivide(D, R0);
+
+ XMVECTOR VIsZero = XMVectorEqual(V, Zero);
+
+ XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);
+
+ Result = XMVectorSelect(Result, Zero, VIsZero);
+
+ return Result;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ sinhf(V.vector4_f32[0]),
+ sinhf(V.vector4_f32[1]),
+ sinhf(V.vector4_f32[2]),
+ sinhf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
+
+ XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, 
Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sinh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + coshf(V.vector4_f32[0]), + coshf(V.vector4_f32[1]), + coshf(V.vector4_f32[2]), + coshf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cosh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanhf(V.vector4_f32[0]), + tanhf(V.vector4_f32[1]), + tanhf(V.vector4_f32[2]), + tanhf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32(g_XMOneHalf.v, E, g_XMOneHalf.v); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tanh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = XM_FMADD_PS(E, g_XMOneHalf.v, g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v, E); + return _mm_sub_ps(g_XMOne.v, E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + asinf(V.vector4_f32[0]), + asinf(V.vector4_f32[1]), + asinf(V.vector4_f32[2]), + asinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = 
vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
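+    // (Rounding can push |V| marginally above 1, which would make 1 - |V|
+    // negative and the square root NaN; the max with zero saturates instead.)
+    // The kernel then evaluates acos(|V|) as sqrt(1 - |V|) times a 7-degree
+    // polynomial in |V|, using the reflection acos(-x) = Pi - acos(x),
+    // selected through the 'nonnegative' mask, to recover negative lanes.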
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR 
TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]) 
+        } } };
+    return Result.v;
+#elif defined(_XM_SVML_INTRINSICS_)
+    XMVECTOR Result = _mm_atan2_ps(Y, X);
+    return Result;
+#else
+
+    // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
+
+    //     Y == 0 and X is Negative         -> Pi with the sign of Y
+    //     Y == 0 and X is Positive         -> 0 with the sign of Y
+    //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
+    //     Y != 0 and X is Negative         -> atan(Y/X) + (Pi with the sign of Y)
+    //     X == -Infinity and Finite Y      -> Pi with the sign of Y
+    //     X == +Infinity and Finite Y      -> 0 with the sign of Y
+    //     Y == Infinity and X is Finite    -> Pi / 2 with the sign of Y
+    //     Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
+    //     Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
+
+    static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f } } };
+
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR ATanResultValid = XMVectorTrueInt();
+
+    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
+    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
+    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
+    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
+
+    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
+    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
+    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
+    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
+    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
+    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
+
+    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
+    Pi = XMVectorOrInt(Pi, YSign);
+    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
+    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
+    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
+
+    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
+    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
+    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
+    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
+    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
+    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
+    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
+
+    XMVECTOR V = XMVectorDivide(Y, X);
+
+    XMVECTOR R0 = XMVectorATan(V);
+
+    R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive);
+    R2 = XMVectorAdd(R0, R1);
+
+    return XMVectorSelect(Result, R2, ATanResultValid);
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept
+{
+    // 7-degree minimax approximation
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            sinf(V.vector4_f32[0]),
+            sinf(V.vector4_f32[1]),
+            sinf(V.vector4_f32[2]),
+            sinf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Force the value within the bounds of pi
+    XMVECTOR x = XMVectorModAngles(V);
+
+    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
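+    // Since sin(Pi - x) = sin(x), lanes with |x| > Pi/2 are replaced below by
+    // copysign(Pi, x) - x, which folds the argument into [-Pi/2, Pi/2] without
+    // changing the result; e.g. x = 2.0 folds to Pi - 2.0 ~= 1.1416.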
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. 
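+    // XMVectorModAngles performs the range reduction x = V - round(V / 2Pi) * 2Pi;
+    // e.g. a lane holding 7.5 * Pi reduces to -0.5 * Pi since round(3.75) = 4.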
+ XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
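+    // A single quadrant fold serves both outputs: sine is invariant under
+    // x -> copysign(Pi, x) - x, while cosine changes sign there, so the same
+    // comparison mask both selects the folded argument and records the sign
+    // multiplied into the cosine polynomial at the end.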
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanf(V.vector4_f32[0]), + tanf(V.vector4_f32[1]), + tanf(V.vector4_f32[2]), + tanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tan_ps(V); + return Result; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result; + Result.f[0] = asinf(V.vector4_f32[0]); + Result.f[1] = asinf(V.vector4_f32[1]); + Result.f[2] = asinf(V.vector4_f32[2]); + Result.f[3] = asinf(V.vector4_f32[3]); + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
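+    // Without the clamp, lanes where rounding pushes |V| just above 1 would
+    // take the square root of a negative value and return NaN. The kernel
+    // approximates acos(|V|); the final vsubq_f32 below converts it through
+    // the identity asin(x) = Pi/2 - acos(x).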
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
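+    // |V| was formed branch-free above as max(V, -V). At the end, the saved
+    // 'nonnegative' mask picks between t0 and Pi - t0 with the and/andnot/or
+    // idiom, since this baseline SSE path cannot assume an SSE4.1 blend.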
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32(g_XMATanEstCoefficients0, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + Result = 
XM_FMADD_PS(Result, x2, vConstants); + // ATanEstCoefficients0 is already splatted + Result = XM_FMADD_PS(Result, x2, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]), + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ } } }; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) noexcept +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_n_f32(V0, L, t); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps(V1, V0); + XMVECTOR S = _mm_set_ps1(t); + return XM_FMADD_PS(L, S, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, 
+ FXMVECTOR V1, + FXMVECTOR T +) noexcept +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_f32(V0, L, T); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps(V1, V0); + return XM_FMADD_PS(Length, T, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0); + vResult = vmlaq_n_f32(vResult, Tangent0, t0); + vResult = vmlaq_n_f32(vResult, Position1, p1); + vResult = vmlaq_n_f32(vResult, Tangent1, t1); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + vResult = XM_FMADD_PS(Tangent0, T0, vResult); + vResult = XM_FMADD_PS(Position1, P1, vResult); + vResult = XM_FMADD_PS(Tangent1, T1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T, T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } 
} }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMaskY)); + T3 = vaddq_f32(T3, T2); + // Add 1.0f to x + T3 = vaddq_f32(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32(Position0, vget_low_f32(T3), 0); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32(T3), 1); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32(T3), 0); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32(T3), 1); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } } }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = XM_FMADD_PS(T3, CatMulT3, T2); + // T3 now has the pre-result. + // I need to add t.y only + T2 = _mm_and_ps(T, g_XMMaskY); + T3 = _mm_add_ps(T3, T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3, _MM_SHUFFLE(0, 0, 0, 0)); + vResult = _mm_mul_ps(vResult, Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(1, 1, 1, 1)); + vResult = XM_FMADD_PS(T2, Tangent0, vResult); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(2, 2, 2, 2)); + vResult = XM_FMADD_PS(T2, Position1, vResult); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(3, 3, 3, 3)); + vResult = XM_FMADD_PS(T3, Tangent1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) noexcept +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + 
XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0, P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P1 = _mm_mul_ps(Position1, P1); + P0 = XM_FMADD_PS(Position0, P0, P1); + P3 = _mm_mul_ps(Position3, P3); + P2 = XM_FMADD_PS(Position2, P2, P3); + P0 = _mm_add_ps(P0, P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTORF32 vResult = { { { + 0.5f * ((-fx * fx * fx + 2 * fx * fx - fx) * Position0.vector4_f32[0] + + (3 * fx * fx * fx - 5 * fx * fx + 2) * Position1.vector4_f32[0] + + (-3 * fx * fx * fx + 4 * fx * fx + fx) * Position2.vector4_f32[0] + + (fx * fx * fx - fx * fx) * Position3.vector4_f32[0]), + + 0.5f * ((-fy * fy * fy + 2 * fy * fy - fy) * Position0.vector4_f32[1] + + (3 * fy * fy * fy - 5 * fy * fy + 2) * Position1.vector4_f32[1] + + (-3 * fy * fy * fy + 4 * fy * fy + fy) * Position2.vector4_f32[1] + + (fy * fy * fy - fy * fy) * Position3.vector4_f32[1]), + + 0.5f * ((-fz * fz * fz + 2 * fz * fz - fz) * Position0.vector4_f32[2] + + (3 * fz * fz * fz - 5 * fz * fz + 2) * Position1.vector4_f32[2] + + (-3 * fz * fz * fz + 4 * fz * fz + fz) * Position2.vector4_f32[2] + + (fz * fz * fz - fz * fz) * Position3.vector4_f32[2]), + + 0.5f * ((-fw * fw * fw + 2 * fw * fw - fw) * Position0.vector4_f32[3] + + (3 * fw * fw * fw - 5 * fw * fw + 2) * Position1.vector4_f32[3] + + (-3 * fw * fw * fw + 4 * fw * fw + fw) * Position2.vector4_f32[3] + + (fw * fw * fw - fw * fw) * Position3.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2, T2); + vResult = vsubq_f32(vResult, T); + vResult = vsubq_f32(vResult, T3); + vResult = vmulq_f32(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3, Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp, Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2, Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp, T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3, T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult, g_XMOneHalf); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 
2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2, T2); + vResult = _mm_sub_ps(vResult, T); + vResult = _mm_sub_ps(vResult, T3); + vResult = _mm_mul_ps(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3, Catmul3); + vTemp = XM_FNMADD_PS(T2, Catmul5, vTemp); + vTemp = _mm_add_ps(vTemp, Catmul2); + vResult = XM_FMADD_PS(vTemp, Position1, vResult); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2, Catmul4); + vTemp = XM_FNMADD_PS(T3, Catmul3, vTemp); + vTemp = _mm_add_ps(vTemp, T); + vResult = XM_FMADD_PS(vTemp, Position2, vResult); + // Position3 is the last term + T3 = _mm_sub_ps(T3, T2); + vResult = XM_FMADD_PS(T3, Position3, vResult); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_n_f32(Position0, R1, f); + return vmlaq_n_f32(R1, R2, g); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + XMVECTOR SF = _mm_set_ps1(f); + R1 = XM_FMADD_PS(R1, SF, Position0); + XMVECTOR SG = _mm_set_ps1(g); + return XM_FMADD_PS(R2, SG, R1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_f32(Position0, R1, F); + return vmlaq_f32(R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + R1 = XM_FMADD_PS(R1, F, Position0); + return XM_FMADD_PS(R2, G, R1); +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t 
CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x2_t vTemp = vacle_f32(vDelta, vget_low_u32(Epsilon)); +#else + uint32x2_t vTemp = vcle_f32(vabs_f32(vDelta), vget_low_f32(Epsilon)); +#endif + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + return (r == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), 
vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32(vget_low_f32(V1), vget_low_f32(V2)); + return 
(vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + float32x2_t B = vget_low_f32(Bounds); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL, B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2, VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1, ivTemp2); + // x and y in bounds? + return (vget_lane_u64(vreinterpret_u64_u32(ivTemp1), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1) & 0x3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(__clang__) && defined(__FINITE_MATH_ONLY__) + return isnan(vgetq_lane_f32(V, 0)) || isnan(vgetq_lane_f32(V, 1)); +#else + float32x2_t VL = vget_low_f32(V); + // Test against itself. NaN is always not equal + uint32x2_t vTempNan = vceq_f32(VL, VL); + // If x or y are NaN, the mask is zero + return (vget_lane_u64(vreinterpret_u64_u32(vTempNan), 0) != 0xFFFFFFFFFFFFFFFFU); +#endif +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(__clang__) && defined(__FINITE_MATH_ONLY__) + XM_ALIGNED_DATA(16) float tmp[4]; + _mm_store_ps(tmp, V); + return isnan(tmp[0]) || isnan(tmp[1]); +#else +// Test against itself. 
NaN is always not equal
+    XMVECTOR vTempNan = _mm_cmpneq_ps(V, V);
+    // If x or y are NaN, the mask is non-zero
+    return ((_mm_movemask_ps(vTempNan) & 3) != 0);
+#endif
+#endif
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#pragma float_control(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    return (XMISINF(V.vector4_f32[0]) ||
+        XMISINF(V.vector4_f32[1]));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Mask off the sign bit
+    uint32x2_t vTemp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(V)), vget_low_u32(g_XMAbsMask));
+    // Compare to infinity
+    vTemp = vceq_f32(vreinterpret_f32_u32(vTemp), vget_low_f32(g_XMInfinity));
+    // If any are infinity, the signs are true.
+    return vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Mask off the sign bit
+    __m128 vTemp = _mm_and_ps(V, g_XMAbsMask);
+    // Compare to infinity
+    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
+    // If x or y are infinity, the signs are true.
+    return ((_mm_movemask_ps(vTemp) & 3) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result;
+    Result.f[0] =
+        Result.f[1] =
+        Result.f[2] =
+        Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1];
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Perform the dot product on x and y
+    float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vget_low_f32(V2));
+    vTemp = vpadd_f32(vTemp, vTemp);
+    return vcombine_f32(vTemp, vTemp);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_dp_ps(V1, V2, 0x3f);
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vDot = _mm_mul_ps(V1, V2);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_moveldup_ps(vDot);
+    return vDot;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V1, V2);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Cross
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
+
+#if defined(_XM_NO_INTRINSICS_)
+    float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = fCross;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Negate = { { { 1.f, -1.f, 0, 0 } } };
+
+    float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2)));
+    vTemp = vmul_f32(vTemp, vget_low_f32(Negate));
+    vTemp = vpadd_f32(vTemp, vTemp);
+    return vcombine_f32(vTemp, vTemp);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap x and y
+    XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
+    // Perform
the muls + vResult = _mm_mul_ps(vResult, V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1)); + // Sub the values + vResult = _mm_sub_ss(vResult, vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; 
+#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(vTemp); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
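+//
+// As with the other Est routines, the estimate trades accuracy for speed:
+// the SSE paths use _mm_rsqrt_ps/_mm_rsqrt_ss (roughly 12 bits of precision)
+// and the NEON path a single vrsqrte_f32 with no Newton-Raphson refinement
+// and no special-case fix-ups. Prefer XMVector2Normalize when full precision
+// or well-defined zero/infinity handling is required.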
+ +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + // Normalize + float32x2_t Result = vmul_f32(VL, vTemp); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = XMVector2Length(V); + float fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + uint32x2_t VEqualsZero = vceq_f32(vTemp, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(vTemp, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + vTemp = vmul_f32(S1, R1); + // Normalize + float32x2_t Result = vmul_f32(VL, vTemp); + Result = vbsl_f32(VEqualsZero, vdup_n_f32(0), Result); + Result = vbsl_f32(VEqualsInf, vget_low_f32(g_XMQNaN), Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + 
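+    // The AND with vZeroMask turns the 0/0 = NaN lanes that the divide
+    // produces for a zero-length input into a well-defined zero vector;
+    // the infinite-length case is patched next.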
// Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_moveldup_ps(vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + return XMVector2ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector2LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result; + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +inline XMVECTOR XM_CALLCONV XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + float IDotN = (Incident.vector4_f32[0] * Normal.vector4_f32[0]) + (Incident.vector4_f32[1] * Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float RY = 1.0f - (IDotN * IDotN); + float RX = 1.0f - (RY * RefractionIndex.vector4_f32[0] * RefractionIndex.vector4_f32[0]); + RY = 1.0f - (RY * RefractionIndex.vector4_f32[1] * RefractionIndex.vector4_f32[1]); + if (RX >= 0.0f) + { + RX = (RefractionIndex.vector4_f32[0] * Incident.vector4_f32[0]) - (Normal.vector4_f32[0] * ((RefractionIndex.vector4_f32[0] * IDotN) + sqrtf(RX))); + } + else + { + RX = 0.0f; + } + if (RY >= 0.0f) + { + RY = (RefractionIndex.vector4_f32[1] * Incident.vector4_f32[1]) - (Normal.vector4_f32[1] * ((RefractionIndex.vector4_f32[1] * IDotN) + sqrtf(RY))); + } + else + { + RY = 0.0f; + } + + XMVECTOR vResult; + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t IL = vget_low_f32(Incident); + float32x2_t NL = vget_low_f32(Normal); + float32x2_t RIL = vget_low_f32(RefractionIndex); + // Get the 2D Dot product of Incident-Normal + float32x2_t vTemp = vmul_f32(IL, NL); + float32x2_t IDotN = vpadd_f32(vTemp, vTemp); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = vmls_f32(vget_low_f32(g_XMOne), IDotN, IDotN); + vTemp = 
vmul_f32(vTemp, RIL); + vTemp = vmls_f32(vget_low_f32(g_XMOne), vTemp, RIL); + // If any terms are <=0, sqrt() will fail, punt to zero + uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero)); + // Sqrt(vTemp) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t S2 = vmul_f32(S1, R1); + vTemp = vmul_f32(vTemp, S2); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = vmla_f32(vTemp, RIL, IDotN); + // Result = RefractionIndex * Incident - Normal * R + float32x2_t vResult = vmul_f32(RIL, IL); + vResult = vmls_f32(vResult, vTemp, NL); + vResult = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(vResult), vMask)); + return vcombine_f32(vResult, vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = XMVector2Dot(Incident, Normal); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR vTemp = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + vTemp = _mm_mul_ps(vTemp, RefractionIndex); + vTemp = XM_FNMADD_PS(vTemp, RefractionIndex, g_XMOne); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp, g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + vTemp = XM_FMADD_PS(RefractionIndex, IDotN, vTemp); + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(vTemp, Normal, vResult); + vResult = _mm_and_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + -V.vector4_f32[1], + V.vector4_f32[0], + 0.f, + 0.f + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { -1.f, 1.f, 0, 0 } } }; + const float32x2_t zero = vdup_n_f32(0); + + float32x2_t VL = vget_low_f32(V); + float32x2_t Result = vmul_f32(vrev64_f32(VL), vget_low_f32(Negate)); + return vcombine_f32(Result, zero); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + vResult = _mm_mul_ps(vResult, g_XMNegateX); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + 
FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector2ReciprocalLength(V1); + XMVECTOR L2 = XMVector2ReciprocalLength(V2); + + XMVECTOR Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector2LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector2Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + GXMVECTOR Line2Point2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); + XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); + XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + + XMVECTOR Result; + const XMVECTOR Zero = XMVectorZero(); + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask, C1); + vResultMask = _mm_max_ps(vResultMask, C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = _mm_cmpgt_ps(vResultMask, g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
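+    // Mirror of the scalar branch above: when C2 is also near zero the lines
+    // are coincident and the failure value is g_XMInfinity; otherwise they
+    // are parallel and never meet, so the failure value is g_XMQNaN.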
+    XMVECTOR vFailMask = _mm_setzero_ps();
+    vFailMask = _mm_sub_ps(vFailMask, C2);
+    vFailMask = _mm_max_ps(vFailMask, C2);
+    vFailMask = _mm_cmple_ps(vFailMask, g_XMEpsilon);
+    XMVECTOR vFail = _mm_and_ps(vFailMask, g_XMInfinity);
+    vFailMask = _mm_andnot_ps(vFailMask, g_XMQNaN);
+    // vFail is NAN or INF
+    vFail = _mm_or_ps(vFail, vFailMask);
+    // Intersection point = Line1Point1 + V1 * (C2 / C1)
+    XMVECTOR vResult = _mm_div_ps(C2, C1);
+    vResult = XM_FMADD_PS(vResult, V1, Line1Point1);
+    // Use result, or failure value
+    vResult = _mm_and_ps(vResult, vResultMask);
+    vResultMask = _mm_andnot_ps(vResultMask, vFail);
+    vResult = _mm_or_ps(vResult, vResultMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    float32x4_t Result = vmlaq_lane_f32(M.r[3], M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = XM_FMADD_PS(vResult, M.r[1], M.r[3]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+    #ifdef _PREFAST_
+    #pragma prefast(push)
+    #pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+    #endif
+
+        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
+
+    #ifdef _PREFAST_
+    #pragma prefast(pop)
+    #endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = 
vld2q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT4)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT4) * 2; + + X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X2); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = 
_mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT4) * 2; + + X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X2); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempA)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempA2)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempA, 1)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempA2, 1)); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (i < VectorCount) + { + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + if (InputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = 
_mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 2; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if (!(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned input, aligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + else + { + // Aligned input, unaligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + 
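+// A minimal usage sketch (illustrative values only): XMVector2TransformCoord
+// is the point transform -- it includes the translation row r[3] and divides
+// by the resulting w -- while XMVector2TransformNormal further below skips
+// r[3], which is what direction vectors need:
+//
+//     XMMATRIX M = XMMatrixTranslation(10.0f, 5.0f, 0.0f);
+//     XMVECTOR p = XMVectorSet(1.0f, 2.0f, 0.0f, 0.0f);
+//     XMVECTOR q = XMVector2TransformCoord(p, M);  // (11, 7, ...)
+//     XMVECTOR n = XMVector2TransformNormal(p, M); // (1, 2, ...) -- unmoved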
+//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + #ifdef _PREFAST_ + #pragma prefast(push) + #pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) + #endif + + XMStoreFloat2(reinterpret_cast(pOutputVector), Result); + + #ifdef _PREFAST_ + #pragma prefast(pop) + #endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + #endif + + vst2q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 
4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + V = vget_high_f32(vResult); + float32x2_t W = vdup_lane_f32(V, 1); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V = vget_low_f32(vResult); + V = vdiv_f32(V, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x2_t Reciprocal = vrecpe_f32(W); + float32x2_t S = vrecps_f32(Reciprocal, W); + Reciprocal = vmul_f32(S, Reciprocal); + S = vrecps_f32(Reciprocal, W); + Reciprocal = vmul_f32(S, Reciprocal); + + V = vget_low_f32(vResult); + V = vmul_f32(V, Reciprocal); + #endif + + vst1_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + X1 = _mm256_shuffle_ps(vTempA, vTempA2, 
0x44); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_castps256_ps128(vTempA))); + pOutputVector += OutputStride; + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_castps256_ps128(vTempA2))); + pOutputVector += OutputStride; + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1))); + pOutputVector += OutputStride; + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_extractf128_ps(vTempA2, 1))); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (i < VectorCount) + { + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + _mm_store_sd(reinterpret_cast(pOutputVector), _mm_castps_pd(vTemp)); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + XMVECTOR V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = 
XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + XMVECTOR V2 = _mm_div_ps(vTemp, W); + + vTemp = _mm_movelh_ps(V1, V2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += sizeof(XMFLOAT2) * 2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + XMVECTOR V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + XMVECTOR V2 = _mm_div_ps(vTemp, W); + + vTemp = _mm_movelh_ps(V1, V2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += sizeof(XMFLOAT2) * 2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + + _mm_store_sd(reinterpret_cast(pOutputVector), _mm_castps_pd(vTemp)); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + + _mm_store_sd(reinterpret_cast(pOutputVector), _mm_castps_pd(vTemp)); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if (!(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF)) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + + _mm_store_sd(reinterpret_cast(pOutputVector), _mm_castps_pd(vTemp)); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, 
row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    float32x4_t Result = vmulq_lane_f32(M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = _mm_mul_ps(vResult, M.r[1]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(Y, row1);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        #ifdef _PREFAST_
+        #pragma prefast(push)
+        #pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+        #endif
+
+        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
+
+        #ifdef _PREFAST_
+        #pragma prefast(pop)
+        #endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT2) * 4;
+
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx
+
+                XM_PREFETCH(pInputVector);
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy
+
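+                // The vld2q_f32 above deinterleaves four packed XMFLOAT2 values into
+                // V.val[0] = {x0,x1,x2,x3} and V.val[1] = {y0,y1,y2,y3}, so each
+                // multiply-accumulate pair transforms four normals at once:
+                //   out.x[i] = A*x[i] + E*y[i],  out.y[i] = B*x[i] + F*y[i]
+                // (A, B, E, F as named in the lane comments above). The interleaved
+                // XM_PREFETCH calls pull the next iterations' input cache lines in
+                // while these multiplies are still in flight.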
XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + + vst2q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32(row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + V = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempA = _mm256_mul_ps(Y1, row1); + __m256 vTempB = _mm256_mul_ps(Y2, row1); + vTempA = _mm256_fmadd_ps(X1, row0, vTempA); + vTempB = _mm256_fmadd_ps(X2, row0, vTempB); + + X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempA = _mm256_mul_ps(Y1, row1); + __m256 vTempB = _mm256_mul_ps(Y2, row1); + vTempA = _mm256_fmadd_ps(X1, row0, vTempA); + vTempB = _mm256_fmadd_ps(X2, row0, vTempB); + + X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempA = _mm256_mul_ps(Y1, row1); + __m256 vTempB = _mm256_mul_ps(Y2, row1); + vTempA = _mm256_fmadd_ps(X1, row0, vTempA); + vTempB = _mm256_fmadd_ps(X2, row0, vTempB); + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_castps256_ps128(vTempA))); + pOutputVector += OutputStride; + + _mm_store_sd(reinterpret_cast(pOutputVector), + 
_mm_castps_pd(_mm256_castps256_ps128(vTempB))); + pOutputVector += OutputStride; + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1))); + pOutputVector += OutputStride; + + _mm_store_sd(reinterpret_cast(pOutputVector), + _mm_castps_pd(_mm256_extractf128_ps(vTempB, 1))); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (i < VectorCount) + { + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Y, row1); + vTemp = XM_FMADD_PS(X, row0, vTemp); + + _mm_store_sd(reinterpret_cast(pOutputVector), _mm_castps_pd(vTemp)); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Y, row1); + XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp); + + // Result 2 + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = _mm_mul_ps(Y, row1); + XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp); + + vTemp = _mm_movelh_ps(V1, V2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += sizeof(XMFLOAT2) * 2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Y, row1); + XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp); + + // Result 2 + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = _mm_mul_ps(Y, row1); + XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp); + + vTemp = _mm_movelh_ps(V1, V2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += sizeof(XMFLOAT2) * 2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Y, row1); + vTemp = XM_FMADD_PS(X, row0, vTemp); + + _mm_store_sd(reinterpret_cast(pOutputVector), _mm_castps_pd(vTemp)); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = _mm_mul_ps(Y, 
row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+
+            i += 2;
+        }
+        }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * 3D Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector3Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector3EqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] == V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] != V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
+
+    uint32_t CR
= 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 7; + uint32_t CR = 0; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + (V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7; + uint32_t CR = 0; + if (iTemp == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && 
!defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp) & 7) == 0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); 
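+    // Note: the vzip_u8/vzip_u16 sequence used by these NEON comparison paths
+    // emulates _mm_movemask_ps, which ARMv7 NEON lacks: the two zips narrow the
+    // 4x32-bit compare mask until lane 1 of vTemp2.val[1] holds one byte per
+    // component, and masking with 0xFFFFFFU keeps only the x, y and z bytes.
+    // A compare result of {~0,~0,~0,0} therefore narrows to 0x00FFFFFF, which
+    // is why the all-true test is "== 0xFFFFFFU".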
+#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = 
XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? 
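+    // (The two vcleq_f32 results above implement -Bounds <= V <= Bounds per
+    // component; ANDing them leaves a lane all-ones only when both tests hold.)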
+ uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x,y and z in bounds? (w is don't care) + return (((_mm_movemask_ps(vTemp1) & 0x7) == 0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(__clang__) && defined(__FINITE_MATH_ONLY__) + return isnan(vgetq_lane_f32(V, 0)) || isnan(vgetq_lane_f32(V, 1)) || isnan(vgetq_lane_f32(V, 2)); +#else +// Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If x or y or z are NaN, the mask is zero + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#endif +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(__clang__) && defined(__FINITE_MATH_ONLY__) + XM_ALIGNED_DATA(16) float tmp[4]; + _mm_store_ps(tmp, V); + return isnan(tmp[0]) || isnan(tmp[1]) || isnan(tmp[2]); +#else +// Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan) & 7) != 0); +#endif +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. 
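+    // Clearing the sign bit first lets one equality test against g_XMInfinity
+    // (0x7F800000) match both +INF and -INF.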
+ uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If x,y or z are infinity, the signs are true. + return ((_mm_movemask_ps(vTemp) & 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fValue; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0x7f); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_and_ps(vTemp, g_XMMask3); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1, V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.vector4_f32[2] + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + return XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1xy = vget_low_f32(V1); + float32x2_t v2xy = vget_low_f32(V2); + + float32x2_t v1yx = vrev64_f32(v1xy); + float32x2_t v2yx = vrev64_f32(v2xy); + + float32x2_t v1zz = vdup_lane_f32(vget_high_f32(V1), 0); + float32x2_t v2zz = vdup_lane_f32(vget_high_f32(V2), 0); + + XMVECTOR vResult = vmulq_f32(vcombine_f32(v1yx, v1xy), vcombine_f32(v2zz, v2yx)); + vResult = vmlsq_f32(vResult, vcombine_f32(v1zz, v1yx), vcombine_f32(v2yx, v2xy)); + vResult = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(vResult), g_XMFlipY)); + return 
vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vResult), g_XMMask3)); +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1, vTemp2); + // z1,x1,y1,w1 + vTemp1 = XM_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1)); + // y2,z2,x2,w2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the right operation + vResult = XM_FNMADD_PS(vTemp1, vTemp2, vResult); + // Set w to zero + return _mm_and_ps(vResult, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 
2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
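+//
+// Illustrative usage sketch (values approximate, not part of the upstream
+// header): the estimate costs a single reciprocal-sqrt approximation instead
+// of a true square root, at roughly half the mantissa bits of accuracy:
+//
+//   XMVECTOR v = XMVectorSet(3.0f, 4.0f, 0.0f, 0.0f);
+//   XMVECTOR n = XMVector3NormalizeEst(v); // ~(0.6, 0.8, 0, 0)
+//   XMVECTOR e = XMVector3Normalize(v);    // exact; use when accuracy matters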
+ +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the normalization + vDot = _mm_mul_ps(vDot, V); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector3Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is 
infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && 
(XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + 
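+    // (Derivation: with the incident ray pointing toward the surface,
+    // cos(theta_i) = -IDotN, so 1 - IDotN*IDotN = sin^2(theta_i) and, by
+    // Snell's law, R = 1 - eta^2 * sin^2(theta_i) = cos^2(theta_t) for
+    // eta = RefractionIndex. R <= 0 means the critical angle is exceeded,
+    // and the total-internal-reflection branch below returns zero.)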
float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN);
+    R = vmulq_f32(R, RefractionIndex);
+    R = vmlsq_f32(g_XMOne, R, RefractionIndex);
+
+    uint32x4_t isrzero = vcleq_f32(R, g_XMZero);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+
+    float32x4_t vResult;
+    if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU)
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // Sqrt(R)
+        float32x4_t S0 = vrsqrteq_f32(R);
+        float32x4_t P0 = vmulq_f32(R, S0);
+        float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+        float32x4_t S1 = vmulq_f32(S0, R0);
+        float32x4_t P1 = vmulq_f32(R, S1);
+        float32x4_t R1 = vrsqrtsq_f32(P1, S1);
+        float32x4_t S2 = vmulq_f32(S1, R1);
+        R = vmulq_f32(R, S2);
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = vmlaq_f32(R, RefractionIndex, IDotN);
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = vmulq_f32(RefractionIndex, Incident);
+        vResult = vmlsq_f32(vResult, R, Normal);
+    }
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
+    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
+    XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex);
+    R = XM_FNMADD_PS(R, R2, g_XMOne);
+
+    XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero);
+    if (_mm_movemask_ps(vResult) == 0x0f)
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = _mm_sqrt_ps(R);
+        R = XM_FMADD_PS(RefractionIndex, IDotN, R);
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = _mm_mul_ps(RefractionIndex, Incident);
+        vResult = XM_FNMADD_PS(R, Normal, vResult);
+    }
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept
+{
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Z = XMVectorSplatZ(V);
+    XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
+
+    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
+
+    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
+    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
+
+    XMVECTOR S = XMVectorAdd(YZYY, Z);
+    XMVECTOR D = XMVectorSubtract(YZYY, Z);
+
+    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
+
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
+
+    return XMVectorSelect(R1, R0, Select);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+) noexcept
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACosEst(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+) noexcept
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACos(Result);
+    return Result;
+}
+
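+// Usage sketch (illustrative): the AngleBetweenNormals variants assume both
+// inputs are already unit length, so they can skip the two reciprocal-length
+// computations that XMVector3AngleBetweenVectors below performs:
+//
+//   XMVECTOR n1 = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
+//   XMVECTOR n2 = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
+//   float radians = XMVectorGetX(XMVector3AngleBetweenNormals(n1, n2)); // XM_PIDIV2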
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) noexcept +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + 
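+    // Seeding the accumulation with M.r[3] treats the input as (x, y, z, 1):
+    // V.w is ignored and the fourth row of M contributes as the translation.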
+    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    XMVECTOR vResult = vmlaq_lane_f32(M.r[3], M.r[0], VL, 0); // X
+    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
+    vResult = XM_FMADD_PS(vResult, M.r[2], M.r[3]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
+    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+#endif
+
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT3* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+        XMVECTOR Z = XMVectorSplatZ(V);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
+        Result = XMVectorMultiplyAdd(Y, row1, Result);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x3_t V = vld3q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT3) * 4;
+
+                float32x2_t r3 = vget_low_f32(row3);
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
+                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N
+
+                XM_PREFETCH(pInputVector);
+
+                r3 = vget_high_f32(row3);
+                r = vget_high_f32(row0);
+                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O
+                XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P
+
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
+                vResult1
= vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, 
row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector 
+= OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = 
vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); + #endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); + #endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 
0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = 
_mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + 
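+                    // Perspective divide: W holds the splatted w component, so
+                    // the division above produces (x/w, y/w, z/w, 1); w == 0 is
+                    // not guarded against.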
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = _mm_mul_ps(vResult, M.r[2]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR 
row2 = M.r[2]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax + XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx + + XM_PREFETCH(pInputVector); + + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = 
_mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 
1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project 
+( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); + XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); + XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); + + XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); + XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); + XMVECTOR OffsetZ = 
vdupq_n_f32(ViewportMinZ); + + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult0 = vdivq_f32(vResult0, W); + vResult1 = vdivq_f32(vResult1, W); + vResult2 = vdivq_f32(vResult2, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult0 = vmulq_f32(vResult0, Reciprocal); + vResult1 = vmulq_f32(vResult1, Reciprocal); + vResult2 = vmulq_f32(vResult2, Reciprocal); + #endif + + V.val[0] = vmlaq_f32(OffsetX, vResult0, ScaleX); + V.val[1] = vmlaq_f32(OffsetY, vResult1, ScaleY); + V.val[2] = vmlaq_f32(OffsetZ, vResult2, ScaleZ); + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + 
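+            // vrecpeq_f32 returns only a rough estimate e of 1/W. Each
+            // vrecpsq_f32 step computes (2 - W*e), and e *= (2 - W*e) roughly
+            // doubles the number of correct bits, so two iterations reach
+            // near full single precision.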
float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); + #endif + + vResult = vmlaq_f32(Offset, vResult, Scale); + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], 
Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = 
_mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + 
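+        // The final multiply-add maps normalized device coordinates into the
+        // viewport: x and y in [-1, 1] become pixel coordinates (y flipped by
+        // the negative height in Scale), and projected z in [0, 1] maps onto
+        // [ViewportMinZ, ViewportMaxZ].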
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + return XMVector3TransformCoord(Result, Transform); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + float sx = 1.f / (ViewportWidth * 0.5f); + float sy = 1.f / (-ViewportHeight * 0.5f); + float sz = 1.f / 
(ViewportMaxZ - ViewportMinZ); + + float ox = (-ViewportX * sx) - 1.f; + float oy = (-ViewportY * sy) + 1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32(OffsetX, ScaleX, V.val[0]); + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32(OffsetY, ScaleY, V.val[1]); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, VY, r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, VY, r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, VY, r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, VY, r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32(OffsetZ, ScaleZ, V.val[2]); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, VZ, r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, VZ, r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, VZ, r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, VZ, r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); + #endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32( + static_cast(*reinterpret_cast(&sx)) + | (static_cast(*reinterpret_cast(&sy)) << 32)); + float32x2_t ScaleH = vcreate_f32(static_cast(*reinterpret_cast(&sz))); + + float32x2_t OffsetL = vcreate_f32( + static_cast(*reinterpret_cast(&ox)) + | (static_cast(*reinterpret_cast(&oy)) << 32)); + float32x2_t OffsetH = vcreate_f32(static_cast(*reinterpret_cast(&oz))); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = 
vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + VL = vmla_f32(OffsetL, VL, ScaleL); + VH = vmla_f32(OffsetH, VH, ScaleH); + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + + #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); + #else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); + #endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 
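+                    // (Results 3 and 4 below repeat the Result 1 steps verbatim for
+                    // V3 and V4: viewport back-mapping via the reciprocal Scale and
+                    // Offset, multiplication by the inverted World*View*Projection
+                    // matrix, then the divide by w.)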
+ V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], 
Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = 
XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+        pInputVector += InputStride;
+
+        V = _mm_mul_ps(V, Scale);
+        V = _mm_add_ps(V, Offset);
+
+        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
+        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
+        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
+        vTemp = _mm_add_ps(vTemp, vTemp2);
+        vTemp = _mm_add_ps(vTemp, vTemp3);
+
+        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+        vTemp = _mm_div_ps(vTemp, W);
+
+        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+        pOutputVector += OutputStride;
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+/****************************************************************************
+ *
+ * 4D Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4EqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t CR = 0;
+
+    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
+        (V1.vector4_f32[3] == V2.vector4_f32[3]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
+        (V1.vector4_f32[3] != V2.vector4_f32[3]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    int iTest = _mm_movemask_ps(vTemp);
+    uint32_t CR = 0;
+    if (iTest == 0xf) // All equal?
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (iTest == 0) // All not equal?
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4EqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) == 0xf) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4EqualIntR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
+        V1.vector4_u32[1] == V2.vector4_u32[1] &&
+        V1.vector4_u32[2] == V2.vector4_u32[2] &&
+        V1.vector4_u32[3] == V2.vector4_u32[3])
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
+        V1.vector4_u32[1] != V2.vector4_u32[1] &&
+        V1.vector4_u32[2] != V2.vector4_u32[2] &&
+        V1.vector4_u32[3] != V2.vector4_u32[3])
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
+    uint32_t CR = 0;
+    if (iTest == 0xf) // All equal?
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (iTest == 0) // All not equal?
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+inline bool XM_CALLCONV XMVector4NearEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Epsilon
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float dx, dy, dz, dw;
+
+    dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
+    dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
+    dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]);
+    dw = fabsf(V1.vector4_f32[3] - V2.vector4_f32[3]);
+    return (((dx <= Epsilon.vector4_f32[0]) &&
+        (dy <= Epsilon.vector4_f32[1]) &&
+        (dz <= Epsilon.vector4_f32[2]) &&
+        (dw <= Epsilon.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vDelta = vsubq_f32(V1, V2);
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
+    uint32x4_t vResult = vacleq_f32(vDelta, Epsilon);
+#else
+    uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon);
+#endif
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Get the difference
+    XMVECTOR vDelta = _mm_sub_ps(V1, V2);
+    // Get the absolute value of the difference
+    XMVECTOR vTemp = _mm_setzero_ps();
+    vTemp = _mm_sub_ps(vTemp, vDelta);
+    vTemp = _mm_max_ps(vTemp, vDelta);
+    vTemp = _mm_cmple_ps(vTemp, Epsilon);
+    return ((_mm_movemask_ps(vTemp) == 0xf) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4NotEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpneq_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp)) != 0);
+#else
+    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4NotEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
+    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) != 0xF) != 0);
+#else
+    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4Greater
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgtq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4GreaterR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if (V1.vector4_f32[0] > V2.vector4_f32[0] &&
+        V1.vector4_f32[1] > V2.vector4_f32[1] &&
+        V1.vector4_f32[2] > V2.vector4_f32[2] &&
+        V1.vector4_f32[3] > V2.vector4_f32[3])
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (V1.vector4_f32[0] <= V2.vector4_f32[0] &&
+        V1.vector4_f32[1] <= V2.vector4_f32[1] &&
+        V1.vector4_f32[2] <= V2.vector4_f32[2] &&
+        V1.vector4_f32[3] <= V2.vector4_f32[3])
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgtq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    uint32_t CR = 0;
+    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest == 0xf)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4GreaterOrEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgeq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
+        (V1.vector4_f32[3] >= V2.vector4_f32[3]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
+        (V1.vector4_f32[3] < V2.vector4_f32[3]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgeq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    uint32_t CR = 0;
+    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest == 0x0f)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4Less
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcltq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmplt_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4GreaterR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4LessOrEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcleq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmple_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1));
+#endif
+}
+
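+// Note: the ...R comparison variants above return a CR6-style comparison
+// record instead of a bool, so a single vector compare can drive several
+// branches. A minimal usage sketch (hypothetical inputs; XMComparisonAllTrue
+// and XMComparisonAllFalse are the record accessors defined earlier in this
+// header):
+//
+//     uint32_t cr = XMVector4GreaterOrEqualR(v, bounds);
+//     if (XMComparisonAllTrue(cr))       { /* all four components >= */ }
+//     else if (XMComparisonAllFalse(cr)) { /* all four components <  */ }
+//     else                               { /* mixed per-component result */ }
+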
+//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // All in bounds? + return ((_mm_movemask_ps(vTemp1) == 0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(__clang__) && defined(__FINITE_MATH_ONLY__) + return isnan(vgetq_lane_f32(V, 0)) || isnan(vgetq_lane_f32(V, 1)) || isnan(vgetq_lane_f32(V, 2)) || isnan(vgetq_lane_f32(V, 3)); +#else +// Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If any are NaN, the mask is zero + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#endif +#elif defined(_XM_SSE_INTRINSICS_) +#if defined(__clang__) && defined(__FINITE_MATH_ONLY__) + XM_ALIGNED_DATA(16) float tmp[4]; + _mm_store_ps(tmp, V); + return isnan(tmp[0]) || isnan(tmp[1]) || isnan(tmp[2]) || isnan(tmp[3]); +#else +// Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan) != 0); +#endif +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0xff); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1, vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp, _MM_SHUFFLE(1, 0, 0, 0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2, vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // 
((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2])) * V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1])) * V1.vector4_f32[3]), + (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3])) * V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3])) * V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[3]), + (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0])) * V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0])) * V1.vector4_f32[3]), + (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2])) * V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1])) * V1.vector4_f32[2]), + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint32x2_t select = vget_low_u32(g_XMMaskX); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32(select, v2yx, v2wz); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32(select, v3wz, v3xy); + + float32x4_t vTemp1 = vcombine_f32(v2zw, v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz, v3wy); + XMVECTOR vResult = vmulq_f32(vTemp1, vTemp2); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32(select, v2wz, v2xy); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32(select, v3yx, v3wz); + + vTemp1 = vcombine_f32(v2wz, v2wy); + vTemp2 = vcombine_f32(v3zw, v3yz); + vResult = vmlsq_f32(vResult, vTemp1, vTemp2); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32(v1yx, vdup_lane_f32(v1yx, 1)); + vResult = vmulq_f32(vResult, vTemp1); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32(select, v2xy, v2wz); + + const float32x2_t v3wx = vbsl_f32(select, v3wz, v3yx); + + vTemp1 = vcombine_f32(v2yw, v2xz); + vTemp2 = vcombine_f32(v3wx, v3wx); + float32x4_t vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32(select, v2wz, v2yx); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32(select, v3xy, v3wz); + + vTemp1 = vcombine_f32(v2wx, v2wx); + vTemp2 = vcombine_f32(v3yw, v3xz); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1zw, 0), 
vdup_lane_f32(v1yx, 0)); + vResult = vmlsq_f32(vResult, vTerm, vTemp1); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz, v2xy); + vTemp2 = vcombine_f32(v3zx, v3yx); + vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = vcombine_f32(v2zx, v2yx); + vTemp2 = vcombine_f32(v3yz, v3xy); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1wz, 0), v1wz); + return vmlaq_f32(vResult, vTerm, vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 1, 3, 2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 3, 2, 3)); + vResult = _mm_mul_ps(vResult, vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 3, 2, 3)); + vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(1, 3, 0, 1)); + vResult = XM_FNMADD_PS(vTemp2, vTemp3, vResult); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 1)); + vResult = _mm_mul_ps(vResult, vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 3, 0, 3)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 1, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_FNMADD_PS(vTemp2, vTemp1, vTemp3); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 2, 2)); + vResult = XM_FNMADD_PS(vTemp1, vTemp3, vResult); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 1, 0, 2)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 0, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_FNMADD_PS(vTemp1, vTemp2, vTemp3); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 3, 3, 3)); + vResult = XM_FMADD_PS(vTemp3, vTemp1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, 
_MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! 
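+    // (Unlike XMVector4ReciprocalLengthEst above, which settles for the
+    // roughly 12-bit _mm_rsqrt_ps estimate, this path computes a full
+    // _mm_sqrt_ps and then divides g_XMOne by it for an exact reciprocal
+    // length.)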
+ vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + 
vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? 
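+    // (The shuffle/add sequence above and below is a horizontal add of the
+    // four squared components written for baseline SSE, which lacks haddps;
+    // the SSE3 path earlier collapses the same reduction into two
+    // _mm_hadd_ps calls.)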
+ vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0xff); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result 
based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + 
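// e.g. (worked example) for V = (3,0,0,0) with LengthMin = 1 and LengthMax = 2: + // Length = 3, Normal = (1,0,0,0) and ClampLength = 2, so the result below is + // (2,0,0,0); the direction is kept and only the length is clamped. +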
XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex); + + uint32x4_t isrzero = vcleq_f32(R, g_XMZero); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + + float32x4_t vResult; + if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32(R, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(R, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + float32x4_t S2 = vmulq_f32(S1, R1); + R = vmulq_f32(R, S2); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32(R, RefractionIndex, IDotN); + // Result = RefractionIndex * Incident - Normal * R + vResult = 
vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32(vResult, R, Normal); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex); + R = XM_FNMADD_PS(R, R2, g_XMOne); + + XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero); + if (_mm_movemask_ps(vResult) == 0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + R = XM_FMADD_PS(RefractionIndex, IDotN, R); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(R, Normal, vResult); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V.vector4_f32[2], + V.vector4_f32[3], + -V.vector4_f32[0], + -V.vector4_f32[1] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { 1.f, 1.f, -1.f, -1.f } } }; + + float32x4_t Result = vcombine_f32(vget_high_f32(V), vget_low_f32(V)); + return vmulq_f32(Result, Negate); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 0, 3, 2)); + vResult = _mm_mul_ps(vResult, FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0] * V.vector4_f32[0]) + (M.m[1][0] * V.vector4_f32[1]) + (M.m[2][0] * V.vector4_f32[2]) + (M.m[3][0] * V.vector4_f32[3]); + float fY = (M.m[0][1] * V.vector4_f32[0]) + (M.m[1][1] * V.vector4_f32[1]) + (M.m[2][1] * V.vector4_f32[2]) + (M.m[3][1] * V.vector4_f32[3]); + float fZ = (M.m[0][2] * V.vector4_f32[0]) + (M.m[1][2] * V.vector4_f32[1]) + (M.m[2][2] * V.vector4_f32[2]) + (M.m[3][2] * V.vector4_f32[3]); + 
float fW = (M.m[0][3] * V.vector4_f32[0]) + (M.m[1][3] * V.vector4_f32[1]) + (M.m[2][3] * V.vector4_f32[2]) + (M.m[3][3] * V.vector4_f32[3]); + XMVECTORF32 vResult = { { { fX, fY, fZ, fW } } }; + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + float32x2_t VH = vget_high_f32(V); + vResult = vmlaq_lane_f32(vResult, M.r[2], VH, 0); // Z + return vmlaq_lane_f32(vResult, M.r[3], VH, 1); // W +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // W + vResult = _mm_mul_ps(vResult, M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = XM_FMADD_PS(vTemp, M.r[2], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT4)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream); + auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pInputVector)); + XMVECTOR W = XMVectorSplatW(V); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(W, row3); + Result = XMVectorMultiplyAdd(Z, row2, Result); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #ifdef _PREFAST_ + #pragma prefast(push) + #pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) + #endif + + XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result); + + #ifdef _PREFAST_ + #pragma prefast(pop) + #endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream); + auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x4_t V = vld4q_f32(reinterpret_cast<const float*>(pInputVector)); + pInputVector += sizeof(XMFLOAT4) * 4; + + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax + XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx + + XM_PREFETCH(pInputVector); + + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx
+ XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1); // Dx + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz + vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + r = vget_low_f32(row3); + vResult0 = vmlaq_lane_f32(vResult0, V.val[3], r, 0); // Ax+Ey+Iz+Mw + vResult1 = vmlaq_lane_f32(vResult1, V.val[3], r, 1); // Bx+Fy+Jz+Nw + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 6)); + + r = vget_high_f32(row3); + vResult2 = vmlaq_lane_f32(vResult2, V.val[3], r, 0); // Cx+Gy+Kz+Ow + vResult3 = vmlaq_lane_f32(vResult3, V.val[3], r, 1); // Dx+Hy+Lz+Pw + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 7)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + V.val[3] = vResult3; + + vst4q_f32(reinterpret_cast<float*>(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = vld1q_f32(reinterpret_cast<const float*>(pInputVector)); + pInputVector += InputStride; + + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + float32x2_t VH = vget_high_f32(V); + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + vResult = vmlaq_lane_f32(vResult, row3, VH, 1); // W + + vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream); + auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream); + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row2 = _mm256_broadcast_ps(&M.r[2]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT4)) + { + if (OutputStride == sizeof(XMFLOAT4)) + { + if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += sizeof(XMFLOAT4) * 2; + + __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm256_mul_ps(vTempX, row0); + vTempY = _mm256_mul_ps(vTempY, row1); + vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX); + vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY); + vTempX = _mm256_add_ps(vTempZ, vTempW); + + XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 2; + } + } + else +
{ + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += sizeof(XMFLOAT4) * 2; + + __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm256_mul_ps(vTempX, row0); + vTempY = _mm256_mul_ps(vTempY, row1); + vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX); + vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY); + vTempX = _mm256_add_ps(vTempZ, vTempW); + + _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += sizeof(XMFLOAT4) * 2; + + __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm256_mul_ps(vTempX, row0); + vTempY = _mm256_mul_ps(vTempY, row1); + vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX); + vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY); + vTempX = _mm256_add_ps(vTempZ, vTempW); + + _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempX)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempX, 1)); + pOutputVector += OutputStride; + i += 2; + } + } + } + } + + if (i < VectorCount) + { + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (; i < VectorCount; i++) + { + __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm_mul_ps(vTempX, row0); + vTempY = _mm_mul_ps(vTempY, row1); + vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX); + vTempW = XM_FMADD_PS(vTempW, row3, vTempY); + vTempX = _mm_add_ps(vTempZ, vTempW); + + _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream); + auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF)) + { + // Aligned input, aligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm_mul_ps(vTempX, row0); +
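// V*M is accumulated in two independent chains here (vTempZ = x*row0 + z*row2, + // vTempW = y*row1 + w*row3) and summed at the end, rather than in one serial + // chain of four multiply-adds, shortening the FMA dependency chain. +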
vTempY = _mm_mul_ps(vTempY, row1); + vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX); + vTempW = XM_FMADD_PS(vTempW, row3, vTempY); + vTempX = _mm_add_ps(vTempZ, vTempW); + + XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input, aligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm_mul_ps(vTempX, row0); + vTempY = _mm_mul_ps(vTempY, row1); + vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX); + vTempW = XM_FMADD_PS(vTempW, row3, vTempY); + vTempX = _mm_add_ps(vTempZ, vTempW); + + XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += OutputStride; + } + } + } + else + { + if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF)) + { + // Aligned input, unaligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm_mul_ps(vTempX, row0); + vTempY = _mm_mul_ps(vTempY, row1); + vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX); + vTempW = XM_FMADD_PS(vTempW, row3, vTempY); + vTempX = _mm_add_ps(vTempZ, vTempW); + + _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input, unaligned output + for (size_t i = 0; i < VectorCount; i++) + { + __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm_mul_ps(vTempX, row0); + vTempY = _mm_mul_ps(vTempY, row1); + vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX); + vTempW = XM_FMADD_PS(vTempW, row3, vTempY); + vTempX = _mm_add_ps(vTempZ, vTempW); + + _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX); + pOutputVector += OutputStride; + } + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * XMVECTOR operators + * + ****************************************************************************/ + +#ifndef _XM_NO_XMVECTOR_OVERLOADS_ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept +{ + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept +{ + return XMVectorNegate(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator+= +( + XMVECTOR& V1, + FXMVECTOR V2 + ) noexcept +{ + V1 = XMVectorAdd(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator-=
+( + XMVECTOR& V1, + FXMVECTOR V2 + ) noexcept +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 + ) noexcept +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 + ) noexcept +{ + V1 = XMVectorDivide(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S + ) noexcept +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S + ) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 + ) noexcept +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 + ) noexcept +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 + ) noexcept +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 + ) noexcept +{ + return XMVectorDivide(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S + ) noexcept +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S + ) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V + ) noexcept +{ + return XMVectorScale(V, S); +} + +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif + diff --git a/include/new.h b/include/new.h new file mode 100644 index 0000000..48b608a --- /dev/null +++ b/include/new.h @@ -0,0 +1,42 @@ +#pragma once + +#ifdef __aligned_alloc +#undef __aligned_alloc +#endif +#ifdef __alloca +#undef __alloca +#endif + +#ifdef _WIN32 +#include <stdlib.h> +#include <malloc.h> +#define __aligned_alloc(alignment, size) _aligned_malloc(size, alignment) +#define __alloca(size) _malloca(size) +#else +#include <stdlib.h> +#define __aligned_alloc(alignment, size) aligned_alloc(alignment, size) +#define __alloca(size) alloca(size) +#endif + +template <typename T> +T * New(int elements) +{ + return (T *)__aligned_alloc(16, (sizeof (T)) * elements); +} + +template <typename T> +T * NewM(int elements) +{ + return (T *)malloc((sizeof (T)) * elements); +} + +/* +template <typename T> +T * NewA(int elements) +{ + return (T *)__alloca((sizeof (T)) *
elements); +} +*/ + +#undef __alloca +#undef __aligned_alloc diff --git a/include/sal.h b/include/sal.h new file mode 100644 index 0000000..9d461e8 --- /dev/null +++ b/include/sal.h @@ -0,0 +1,2941 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*** +*sal.h - markers for documenting the semantics of APIs +* + +* +*Purpose: +* sal.h provides a set of annotations to describe how a function uses its +* parameters - the assumptions it makes about them, and the guarantees it makes +* upon finishing. +****/ +#pragma once + +/*========================================================================== + + The comments in this file are intended to give basic understanding of + the usage of SAL, the Microsoft Source Code Annotation Language. + For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134 + + The macros are defined in 3 layers, plus the structural set: + + _In_/_Out_/_Ret_ Layer: + ---------------------- + This layer provides the highest abstraction and its macros should be used + in most cases. These macros typically start with: + _In_ : input parameter to a function, unmodified by called function + _Out_ : output parameter, written to by called function, pointed-to + location not expected to be initialized prior to call + _Outptr_ : like _Out_ when returned variable is a pointer type + (so param is pointer-to-pointer type). Called function + provides/allocated space. + _Outref_ : like _Outptr_, except param is reference-to-pointer type. + _Inout_ : inout parameter, read from and potentially modified by + called function. + _Ret_ : for return values + _Field_ : class/struct field invariants + For common usage, this class of SAL provides the most concise annotations. + Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used + with a parameter target. Using them with _At_ to specify non-parameter + targets may yield unexpected results. + + This layer also includes a number of other properties that can be specified + to extend the ability of code analysis, most notably: + -- Designating parameters as format strings for printf/scanf/scanf_s + -- Requesting stricter type checking for C enum parameters + + _Pre_/_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_. + This layer provides the most flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + Structural Layer: + ---------------- + These annotations, like _At_ and _When_, are used with annotations from + any of the other layers as modifiers, indicating exactly when and where + the annotations apply. + + + Common syntactic conventions: + ---------------------------- + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the parameter can be NULL as a precondition to the function, the + annotation contains _opt. If the macro does not contain '_opt' the + parameter cannot be NULL. + + If an out/inout parameter returns a null pointer as a postcondition, this is + indicated by _Ret_maybenull_ or _result_maybenull_. 
If the macro is not + of this form, then the result will not be NULL as a postcondition. + _Outptr_ - output value is not NULL + _Outptr_result_maybenull_ - output value might be NULL + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + ------------- + Buffer sizes are expressed as element counts, unless the macro explicitly + contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in + which case the second is used to indicate how much of the buffer is valid + as a postcondition. This table outlines the precondition buffer allocation + size, precondition number of valid elements, postcondition allocation size, + and postcondition number of valid elements for representative buffer size + annotations: + Pre | Pre | Post | Post + alloc | valid | alloc | valid + Annotation elems | elems | elems | elems + ---------- ------------------------------------ + _In_reads_(s) s | s | s | s + _Inout_updates_(s) s | s | s | s + _Inout_updates_to_(s,c) s | s | s | c + _Out_writes_(s) s | 0 | s | s + _Out_writes_to_(s,c) s | 0 | s | c + _Outptr_result_buffer_(s) ? | ? | s | s + _Outptr_result_buffer_to_(s,c) ? | ? | s | c + + For the _Outptr_ annotations, the buffer in question is at one level of + dereference. The called function is responsible for supplying the buffer. + + Success and failure: + ------------------- + The SAL concept of success allows functions to define expressions that can + be tested by the caller, which if it evaluates to non-zero, indicates the + function succeeded, which means that its postconditions are guaranteed to + hold. Otherwise, if the expression evaluates to zero, the function is + considered to have failed, and the postconditions are not guaranteed. + + The success criteria can be specified with the _Success_(expr) annotation: + _Success_(return != FALSE) BOOL + PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned, + and FALSE indicates failure. In common practice, callers check for zero + vs. non-zero returns, so it is preferable to express the success + criteria in terms of zero/non-zero, not checked for exactly TRUE. + + Functions can specify that some postconditions will still hold, even when + the function fails, using _On_failure_(anno-list), or postconditions that + hold regardless of success or failure using _Always_(anno-list). + + The annotation _Return_type_success_(expr) may be used with a typedef to + give a default _Success_ criteria to all functions returning that type. + This is the case for common Windows API status types, including + HRESULT and NTSTATUS. This may be overridden on a per-function basis by + specifying a _Success_ annotation locally. 
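+ +    For illustration (a hypothetical API, not part of this header): +        typedef _Return_type_success_(return >= 0) int MYSTATUS; +        MYSTATUS ReadBlock(_Out_writes_bytes_(cb) void* pv, size_t cb); +    gives every function returning MYSTATUS the default success criterion +    'return >= 0', unless a local _Success_ annotation overrides it.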
+ +============================================================================*/ + +#define __ATTR_SAL + +#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ +#define _SAL_VERSION 20 +#endif + +#ifdef _PREFAST_ // [ + +// choose attribute or __declspec implementation +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] +#endif // ] + + +#if !_USE_DECLSPECS_FOR_SAL // [ +#if !_USE_ATTRIBUTES_FOR_SAL // [ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] +#endif // ] +#endif // ] + +#else + +// Disable expansion of SAL macros in non-Prefast mode to +// improve compiler throughput. +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#ifndef _USE_ATTRIBUTES_FOR_SAL // [ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#endif // ] + +// safeguard for MIDL and RC builds +#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL + +// Special enum type for Y/N/M +enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; + +#endif + +#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) +#else +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) +#endif + +//============================================================================ +// Structural SAL: +// These annotations modify the use of other annotations. They may +// express the annotation target (i.e. what parameter/field the annotation +// applies to) or the condition under which the annotation is applicable. +//============================================================================ + +// _At_(target, annos) specifies that the annotations listed in 'annos' is to +// be applied to 'target' rather than to the identifier which is the current +// lexical target. 
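+// e.g. (illustrative; Alloc/ppv are hypothetical) +//   _At_(*ppv, _Post_notnull_) HRESULT Alloc(void** ppv); +// applies _Post_notnull_ to the pointed-to value *ppv rather than to ppv.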
+#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_) + +// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that +// target names a buffer, and each annotation in annos is applied to each +// element of target up to bound, with the variable named in iter usable +// by the annotations to refer to relevant offsets within target. +#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_) + +// _When_(expr, annos) specifies that the annotations listed in 'annos' only +// apply when 'expr' evaluates to non-zero. +#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_) +#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_) +#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_) + +// indicates whether normal post conditions apply to a function +#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr)) + +// indicates whether post conditions apply to a function returning +// the type that this annotation is applied to +#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr)) + +// Establish postconditions that apply only if the function does not succeed +#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_) + +// Establish postconditions that apply in both success and failure cases. +// Only applicable with functions that have _Success_ or _Return_type_success_. +#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_) + +// Usable on a function definition. Asserts that a function declaration is +// in scope, and its annotations are to be used. There are no other annotations +// allowed on the function definition. +#define _Use_decl_annotations_ _Use_decl_anno_impl_ + +// _Notref_ may precede a _Deref_ or "real" annotation, and removes one +// level of dereference if the parameter is a C++ reference (&). If the +// net deref on a "real" annotation is negative, it is simply discarded. +#define _Notref_ _Notref_impl_ + +// Annotations for defensive programming styles. +#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive) +#define _Post_defensive_ _SA_annotes0(SAL_post_defensive) + +#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes) +#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes) +#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes) + +//============================================================================ +// _In_\_Out_ Layer: +//============================================================================ + +// Reserved pointer parameters, must always be NULL. +#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl)) + +// _Const_ allows specification that any namable memory location is considered +// readonly for a given call. +#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref)) + + +// Input parameters -------------------------- + +// _In_ - Annotations for parameters where data is passed into the function, but not modified. +// _In_ by itself can be used with non-pointer types (although it is redundant). + +// e.g. void SetPoint( _In_ const POINT* pPT ); +#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_) + +// nullterminated 'in' parameters. +// e.g.
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl)) +#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl)) + + +// 'input' buffers with given size + +#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_) +#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_) +#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) +#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) + + +// 'input' buffers valid to the given end pointer + +#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_) +#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_) + + + +// Output parameters -------------------------- + +// _Out_ - Annotations for pointer or reference parameters where data is passed back to the caller. +// These are mostly used where the pointer/reference is to a non-pointer type. +// _Outptr_/_Outref_ (see below) are typically used to return pointers via parameters. + +// e.g.
void GetPoint( _Out_ POINT* pPT ); +#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_) +#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_) + +#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) + +#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ _Post_z_) +#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ _Post_z_) + + +// Inout parameters ---------------------------- + +// _Inout_ - Annotations for pointer or reference parameters where data is passed in and +// potentially modified.
+// void ModifyPoint( _Inout_ POINT* pPT ); +// void ModifyPointByRef( _Inout_ POINT& pPT ); + +#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) +#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) + +// For modifying string buffers +// void toupper( _Inout_z_ char* sz ); +#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) +#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) + +// For modifying buffers with explicit element size +#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) +#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) + +#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) +#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) + +#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) + +// For modifying buffers with explicit byte size +#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_bytes_opt_(size) _SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) + +#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) +#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) + +#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) + + +// Pointer to pointer parameters ------------------------- + +// _Outptr_ - Annotations for output params returning pointers +// These describe parameters where the called function provides the buffer: +// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); +// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates +// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. +// +// _Outptr_opt_ - describes parameters that are allowed to be NULL. +// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. 
+// +// Example: +// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); +// Callers: +// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL +// MyFunc(&pData1, &pData2); // ok: both non-NULL +// if (*pData1 == *pData2) ... // error: pData2 might be NULL after call + +#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) +#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) + +// Annotations for _Outptr_ parameters returning pointers to null terminated strings. + +#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) +#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) +#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) +#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) + +// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. + +#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, +// following the COM convention of setting the output to NULL on failure. +// The current implementation is identical to _Outptr_result_nullonfailure_. +// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
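+// e.g. (illustrative COM-style factory; IWidget is hypothetical) +//   HRESULT CreateWidget(_COM_Outptr_ IWidget** ppWidget); +// on success *ppWidget receives a valid object; on failure it is set to NULL.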
+ +#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes + +#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) + +#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) + +#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_(size, count) 
_SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) + +#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) + +// Annotations for output reference to pointer parameters. 
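+// e.g. (illustrative; GetScratch is hypothetical, cb counts bytes) +//   void GetScratch(_Outref_result_bytebuffer_(cb) BYTE*& pb, size_t cb); +// binds the reference-to-pointer pb to a non-NULL buffer of cb bytes.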
+ +#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) +#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) + +#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) + +#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) + +// Annotations for output reference to pointer parameters that guarantee +// that the pointer is set to NULL on failure. +#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) + +// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. +#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) +#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) + + +// return values ------------------------------- + +// +// _Ret_ annotations +// +// describing conditions that hold for return values after the call + +// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); +#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) +#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// used with allocated but not yet initialized objects +#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) +#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) +#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) + +// used with allocated and initialized objects +// returns single valid object +#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) + +// returns pointer to initialized buffer of specified size +#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) + +// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' +#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) + + +// Annotations for strict type checking +#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) +#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) +#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) + +// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); +#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) +#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) + +// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
 );
+#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_)
+#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_)
+#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_)
+
+#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where)
+#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x))
+#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x))
+#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x))
+
+// annotations to express value of integral or pointer parameter
+#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub))
+#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub))
+#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub))
+#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub))
+#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub))
+#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub))
+#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr))
+#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr))
+
+// annotation to express that a value (usually a field of a mutable class)
+// is not changed by a function call
+#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_))
+
+// Annotations to allow expressing generalized pre and post conditions.
+// 'cond' may be any valid SAL expression that is considered to be true as a precondition
+// or postcondition (respectively).
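+// e.g. (illustrative; both functions are hypothetical)
+//     void WriteSector( _Pre_satisfies_(cbOffset % 512 == 0) size_t cbOffset );
+//     _Post_satisfies_(return <= cbBuf) size_t TrimLength( size_t cbBuf );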
+#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond)) +#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond)) + +// Annotations to express struct, class and field invariants +#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size)) + +#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size)) +#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size)) +#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size)) +#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size)) + +#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size)) +#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size)) +#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size)) +#define _Field_size_bytes_full_opt_(size) _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size)) + +#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_) + +#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max)) + +//============================================================================ +// _Pre_\_Post_ Layer: +//============================================================================ + +// +// Raw Pre/Post for declaring custom pre/post conditions +// + +#define _Pre_ _Pre_impl_ +#define _Post_ _Post_impl_ + +// +// Validity property +// + +#define _Valid_ _Valid_impl_ +#define _Notvalid_ _Notvalid_impl_ +#define _Maybevalid_ _Maybevalid_impl_ + +// +// Buffer size properties +// + +// Expressing buffer sizes without specifying pre or post condition +#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size)) +#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size)) +#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size)) +#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size)) + +#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_) +#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_) + +// Expressing buffer size as pre or post condition +#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), 
_Pre1_impl_(__cap_impl(size)))
+#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
+#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size)))
+
+#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_)
+#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size)))
+#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
+#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size)))
+
+//
+// Pointer null-ness properties
+//
+#define _Null_ _Null_impl_
+#define _Notnull_ _Notnull_impl_
+#define _Maybenull_ _Maybenull_impl_
+
+//
+// _Pre_ annotations ---
+//
+// describing conditions that must be met before the call of the function
+
+// e.g. int strlen( _Pre_z_ const char* sz );
+// buffer is a zero terminated string
+#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_)
+
+// valid size unknown or indicated by type (e.g.:LPSTR)
+#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_)
+#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_)
+
+#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl))
+
+// Overrides recursive valid when some field is not yet initialized when using _Inout_
+#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl))
+
+// used with allocated but not yet initialized objects
+#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref))
+#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref))
+#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref))
+
+//
+// _Post_ annotations ---
+//
+// describing conditions that hold after the function call
+
+// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo );
+// buffer will be a zero-terminated string after the call
+#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_)
+
+// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj );
+#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_)
+#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl))
+
+// e.g. void free( _Post_ptr_invalid_ void* pv );
+#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl))
+
+// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv );
+#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl))
+
+// e.g.
HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p); +#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl)) + +#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl)) + +#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_) + + +// #pragma region Input Buffer SAL 1 compatibility macros + +/*========================================================================== + + This section contains definitions for macros defined for VS2010 and earlier. + Usage of these macros is still supported, but the SAL 2 macros defined above + are recommended instead. This comment block is retained to assist in + understanding SAL that still uses the older syntax. + + The macros are defined in 3 layers: + + _In_\_Out_ Layer: + ---------------- + This layer provides the highest abstraction and its macros should be used + in most cases. Its macros start with _In_, _Out_ or _Inout_. For the + typical case they provide the most concise annotations. + + _Pre_\_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_, + _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most + flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + + Annotation Syntax: + |--------------|----------|----------------|-----------------------------| + | Usage | Nullness | ZeroTerminated | Extent | + |--------------|----------|----------------|-----------------------------| + | _In_ | <> | <> | <> | + | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | + | _Inout_ | | | [byte]count_[c_|x_]( size ) | + | _Deref_out_ | | | ptrdiff_cap_( ptr ) | + |--------------| | | ptrdiff_count_( ptr ) | + | _Ret_ | | | | + | _Deref_ret_ | | | | + |--------------| | | | + | _Pre_ | | | | + | _Post_ | | | | + | _Deref_pre_ | | | | + | _Deref_post_ | | | | + |--------------|----------|----------------|-----------------------------| + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for + formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the pointer can be NULL the annotation contains _opt. If the macro + does not contain '_opt' the pointer may not be NULL. + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + |------|---------------|---------------| + | Unit | Writ\Readable | Argument Type | + |------|---------------|---------------| + | <> | cap_ | <> | + | byte | count_ | c_ | + | | | x_ | + |------|---------------|---------------| + + 'cap' (capacity) describes the writable size of the buffer and is typically used + with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes + 'count' describes the readable size of the buffer and is typically used with _In_. + The default unit is elements. Use 'bytecount' if the size is given in bytes. + + Argument syntax for cap_, bytecap_, count_, bytecount_: + (|return)[+n] e.g. 
cch, return, cb+2
+
+    If the buffer size is a constant expression use the c_ postfix.
+    E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16)
+
+    If the buffer size is given by a limiting pointer use the ptrdiff_ versions
+    of the macros.
+
+    If the buffer size is neither a parameter nor a constant expression use the x_
+    postfix. e.g. bytecount_x_(num*size). x_ annotations accept any arbitrary string.
+    No analysis can be done for x_ annotations but they at least tell the tool that
+    the buffer has some sort of extent description. x_ annotations might be supported
+    by future compiler versions.
+
+============================================================================*/
+
+// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch )
+// valid buffer extent described by another parameter
+#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_)
+#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_)
+#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_)
+#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
+
+// valid buffer extent described by a constant expression
+#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_)
+#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_)
+#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_)
+#define _In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_)
+
+// nullterminated 'input' buffers with given size
+
+// e.g. void SetCharRange( _In_z_count_(cch) const char* rgch, size_t cch )
+// nullterminated valid buffer extent described by another parameter
+#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_)
+#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_)
+#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_)
+#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
+
+// nullterminated valid buffer extent described by a constant expression
+#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_)
+#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_)
+#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_)
+#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_)
+
+// buffer capacity is described by another pointer
+// e.g.
void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while( pch < pchMax ) pch++; }
+#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_)
+#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_)
+
+// 'x' version for complex expressions that are not supported by the current compiler version
+// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows );
+#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_)
+#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_)
+#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_)
+#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_)
+
+
+// 'out' with buffer size
+// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices );
+// buffer capacity is described by another parameter
+#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_)
+#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_)
+#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_)
+#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_)
+
+// buffer capacity is described by a constant expression
+#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_)
+#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_)
+#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_)
+#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_)
+
+// buffer capacity is described by another parameter multiplied by a constant expression
+#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_)
+#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_)
+#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_)
+
+// buffer capacity is described by another pointer
+// e.g.
void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while( pch < pchMax ) pch++; }
+#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_)
+#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_)
+
+// buffer capacity is described by a complex expression
+#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_)
+#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_)
+#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_)
+#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_)
+
+// a zero terminated string is filled into a buffer of given capacity
+// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo );
+// buffer capacity is described by another parameter
+#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_)
+
+// buffer capacity is described by a constant expression
+#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_)
+
+// buffer capacity is described by a complex expression
+#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_)
+
+// a buffer of given capacity is partially filled; the valid (readable) count after the call is given by another parameter or the return value
+// e.g.
size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return) char* rgchTo, size_t cchTo );
+#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count))
+#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count))
+#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count))
+#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count))
+
+// a zero terminated string is filled into a buffer of given capacity
+// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo );
+#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count))
+#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count))
+#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count))
+#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count))
+
+// only use with dereferenced arguments e.g. '*pcch'
+#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount))
+#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount))
+#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount))
+#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount))
+
+#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount))
+#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount))
+#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount))
+#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount))
+
+// e.g.
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); +#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) +#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) + + +// 'inout' buffers with initialized elements before and after the call +// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size)) +#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size)) +#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size)) +#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size)) + +#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size)) +#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size)) +#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size)) +#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size)) + +// nullterminated 'inout' buffers with initialized elements before and after the call +// e.g. 
void ModifyString( _Inout_z_count_(cchStr) char* szStr, size_t cchStr );
+#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size))
+#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size))
+#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size))
+#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size))
+
+#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size))
+#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size))
+#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size))
+#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size))
+
+#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size))
+#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size))
+
+#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size))
+#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size))
+#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size))
+#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size))
+
+// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR szTo, size_t cchTo );
+#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_)
+#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_)
+#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_)
+#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_)
+
+#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_)
+#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_)
+#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_)
+#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_)
+
+#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_)
+#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_)
+#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_)
+#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_)
+
+// inout string buffers with writable size
+// e.g.
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_) +#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_) +#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_) +#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_) + +#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_) +#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_) +#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_) +#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_) + +#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_) +#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_) +#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_) +#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_) + + +// returning pointers to valid objects +#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_) +#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_) + +// annotations to express 'boundedness' of integral value parameter +#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_) +#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_) +#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_) +#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_) +#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_) +#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_) +#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_) + +// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); +#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_) +#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_) +#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_) +#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_) + +// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo ); +#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_) +#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_) +#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_) +#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_) + +// +// _Deref_pre_ --- +// +// describing conditions for array elements of dereferenced pointer parameters that must be met before the call + +// e.g. 
void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[], size_t cStrings );
+#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_)
+
+// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[], size_t cStrings );
+// buffer capacity is described by another parameter
+#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)))
+#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)))
+#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)))
+#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)))
+
+// buffer capacity is described by a constant expression
+#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)))
+#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)))
+#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)))
+#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)))
+
+// buffer capacity is described by a complex condition
+#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)))
+#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)))
+#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)))
+#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)))
+
+// convenience macros for nullterminated buffers with given capacity
+#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_)
+
+#define
_Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) 
_Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); +// valid buffer extent is described by another parameter +#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a constant expression +#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a complex expression +#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_x_(size) _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems );
+#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_)
+#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl))
+
+#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref))
+#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref))
+#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref))
+
+// restrict access rights
+#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref))
+#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref))
+
+//
+// _Deref_post_ ---
+//
+// describing conditions for array elements or dereferenced pointer parameters that hold after the call
+
+// e.g. void CloneString( _In_z_ const WCHAR* wzIn, _Out_ _Deref_post_z_ WCHAR** pWzOut );
+#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_)
+#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_)
+
+// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv );
+// buffer capacity is described by another parameter
+#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)))
+#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)))
+#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)))
+#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)))
+
+// buffer capacity is described by a constant expression
+#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)))
+#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)))
+#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)))
+#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)))
+
+// buffer capacity is described by a complex expression
+#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)))
+#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)))
+#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) +#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) +#define 
_Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) + +// e.g. 
HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); +// valid buffer extent is described by another parameter +#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. 
void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); +#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_) +#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_) + +#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref)) +#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref)) +#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref)) + +// +// _Deref_ret_ --- +// + +#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl)) +#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl)) + +// +// special _Deref_ --- +// +#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref)) + +// +// _Ret_ --- +// + +// e.g. _Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); +#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_) +#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); +// Buffer capacity is described by another parameter +#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) +#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) + +// Buffer capacity is described by a constant expression +#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) +#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) + +// Buffer capacity is described by a complex condition +#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) +#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) + +// return value is nullterminated and capacity is given by another parameter +#define _Ret_z_cap_(size) 
_SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); +// Valid Buffer extent is described by another parameter +#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a constant expression +#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a complex expression +#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) + +// return value is nullterminated and length is given by another parameter +#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecount_(size) 
_SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) + + +// _Pre_ annotations --- +#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// restrict access rights +#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) +#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) + +// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); +// buffer capacity described by another parameter +#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) +#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) + +// buffer capacity described by a constant expression +#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) +#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) +#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) + +// buffer capacity described by size of other buffer, only used by dangerous legacy APIs +// e.g. 
char* strcpy(_Pre_cap_for_(src) char* dst, const char* src);
+#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param)))
+#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param)))
+
+// buffer capacity described by a complex condition
+#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)))
+#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)))
+#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)))
+#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)))
+
+// buffer capacity described by the difference to another pointer parameter
+#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr))))
+#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr))))
+
+// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo );
+#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_)
+#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_)
+
+#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_)
+#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_)
+
+#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_)
+#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref)
_Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo );
+// Valid buffer extent described by another parameter
+#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
+#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
+
+// Valid buffer extent described by a constant expression
+#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_)
+#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_)
+
+// Valid buffer extent described by a complex expression
+#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_)
+#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_)
+#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_)
+
+// Valid buffer extent described by the difference to another pointer parameter
+#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_)
+#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_)
+
+
+// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count)
+// buffer may be zero-terminated after the call
+#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl))
+
+// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem );
+#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size)))
+#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size)))
+
+// e.g. size_t strlen( _In_z_ _Post_count_(return+1) const char* sz );
+#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_)
+#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
+#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
+#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
+#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
+#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
+
+// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cchTo) _Post_z_count_(return+1) char* szTo, size_t cchTo );
+#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_)
+#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_)
+#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_)
+#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_)
+#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_)
+#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_)
+
+//
+// _Prepost_ ---
+//
+// describing conditions that hold before and after the function call
+
+#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_)
+
+#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size))
+#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size))
+#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size))
+#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size))
+#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size))
+#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size))
+#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size))
+#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size))
+#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size))
+#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size))
+#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size))
+#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size)
_Post_bytecount_x_(size)) + +#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) +#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) + +// +// _Deref_ --- +// +// short version for _Deref_pre_ _Deref_post_ +// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call + +#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) +#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) + +#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) +#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) +#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) +#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) + +#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) +#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) +#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) +#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) + +#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) +#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) +#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) +#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) + +#define _Deref_prepost_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) +#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) +#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) +#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) + +#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) +#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) +#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) 
_Deref_post_valid_bytecap_x_(size))
+#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size))
+
+#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size))
+#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size))
+#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size))
+#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size))
+
+#define _Deref_prepost_count_x_(size) _SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size))
+#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size))
+#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size))
+#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size))
+
+#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_)
+#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_)
+
+//
+// _Deref_
+//
+// used with references to arrays
+
+#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_)
+#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_)
+#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_)
+#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_)
+#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_)
+
+// #pragma endregion Input Buffer SAL 1 compatibility macros
+
+
+//============================================================================
+// Implementation Layer:
+//============================================================================
+
+
+// Naming conventions:
+// A symbol that begins with _SA_ is for the machinery of creating any
+// annotations; many of those come from sourceannotations.h in the case
+// of attributes.
+
+// A symbol that ends with _impl is the very lowest level macro. It is
+// not required to be a legal standalone annotation, and in the case
+// of attribute annotations, usually is not. (In the case of some declspec
+// annotations, it might be, but it should not be assumed so.) Those
+// symbols will be used in the _PreN..., _PostN... and _RetN... annotations
+// to build up more complete annotations.
+
+// A symbol ending in _impl_ is reserved to the implementation as well,
+// but it does form a complete annotation; usually they are used to build
+// up even higher level annotations.
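+
+// Editorial sketch (not part of the original header text): concretely, under the
+// declspec configuration (_USE_DECLSPECS_FOR_SAL), a completed annotation such as
+// _Post_valid_impl_ is assembled from the lower layers defined below:
+//
+//   _Post_valid_impl_   expands to   _Post1_impl_(__valid_impl)
+//   _Post1_impl_(p1)    expands to   _Post_impl_ p1
+//   _Post_impl_         expands to   _SA_annotes0(SAL_post)   i.e. __declspec("SAL_post")
+//   __valid_impl        expands to   _SA_annotes0(SAL_valid)  i.e. __declspec("SAL_valid")
+//
+// so the annotated declaration ultimately carries plain __declspec strings that the
+// analysis tools parse back out.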
+
+
+#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [
+// Sharable "_impl" macros: these can be shared between the various annotation
+// forms but are part of the implementation of the macros. These are collected
+// here to assure that only necessary differences in the annotations
+// exist.
+
+#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_)
+#define _Bound_impl_ _SA_annotes0(SAL_bound)
+#define _Field_range_impl_(min,max) _Range_impl_(min,max)
+#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes)
+#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe)
+#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe)
+#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect)
+#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no)
+#define _Notnull_impl_ _SA_annotes1(SAL_null, __no)
+#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no)
+#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string")))
+#define _Null_impl_ _SA_annotes1(SAL_null, __yes)
+#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes)
+#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_
+#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_
+#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no))
+#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond)
+#define _Post_valid_impl_ _Post1_impl_(__valid_impl)
+#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond)
+#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl)
+#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max)
+#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size))
+#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size))
+#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl)
+#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond)
+#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes)
+#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size))
+#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size))
+
+#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max)
+#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max)
+#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max)
+#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max)
+#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max)
+#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max)
+
+#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_
+#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_
+
+// The following are for the implementation machinery, and are not
+// suitable for annotating general code.
+// We're trying to phase this out, someday. The parser quotes the param.
+#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE)
+
+// Normally the parser does some simple type checking of annotation params,
+// defer that check to the plugin.
+#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) + +#define _SA_SPECSTRIZE( x ) #x +#define _SAL_nop_impl_ /* nothing */ +#define __nop_impl(x) x +#endif + + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +// Using attributes for sal + +#include "codeanalysis\sourceannotations.h" + + +#define _SA_annotes0(n) [SAL_annotes(Name=#n)] +#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] +#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] +#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] + +#define _Pre_impl_ [SAL_pre] +#define _Post_impl_ [SAL_post] +#define _Deref_impl_ [SAL_deref] +#define _Notref_impl_ [SAL_notref] + + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +// Benign declspec needed here for WindowsPREfast +#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +// Using declspecs for sal + +#define _SA_annotes0(n) __declspec(#n) +#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) +#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") +#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") + +#define _Pre_impl_ _SA_annotes0(SAL_pre) +#define _Post_impl_ _SA_annotes0(SAL_post) +#define _Deref_impl_ _SA_annotes0(SAL_deref) +#define _Notref_impl_ _SA_annotes0(SAL_notref) + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun + +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun + +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) + +#else // ][ + +// Using "nothing" for sal + +#define _SA_annotes0(n) +#define _SA_annotes1(n,pp1) +#define _SA_annotes2(n,pp1,pp2) +#define _SA_annotes3(n,pp1,pp2,pp3) + +#define __ANNOTATION(fun) +#define __PRIMOP(type, fun) +#define __QUALIFIER(type, fun) + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ + +// Declare annotations that need to be declared. +__ANNOTATION(SAL_useHeader(void)); +__ANNOTATION(SAL_bound(void)); +__ANNOTATION(SAL_allocator(void)); //??? 
resolve with PFD +__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); +__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); +__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_encoded(void)); +__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_volatile(void)); +__ANNOTATION(SAL_nonvolatile(void)); +__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); +__ANNOTATION(SAL_mustInspect(void)); + +// Only appears in model files, but needs to be declared. +__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); + +// To be declared well-known soon. +__ANNOTATION(SAL_interlocked(void);) + +#pragma warning (suppress: 28227 28241) +__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) + +__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); +__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); +__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] + +#define _Success_impl_(expr) [SA_Success(Condition=#expr)] +#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) + +#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] +#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] +#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] + +#define _In_bound_impl_ [SA_PreBound(Deref=0)] +#define _Out_bound_impl_ [SA_PostBound(Deref=0)] +#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] +#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] +#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] +#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] + +#define __valid_impl Valid=SA_Yes +#define __maybevalid_impl Valid=SA_Maybe +#define __notvalid_impl Valid=SA_No + +#define __null_impl Null=SA_Yes +#define __maybenull_impl Null=SA_Maybe +#define __notnull_impl Null=SA_No + +#define __null_impl_notref Null=SA_Yes,Notref=1 +#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 +#define __notnull_impl_notref Null=SA_No,Notref=1 + +#define __zterm_impl NullTerminated=SA_Yes +#define __maybezterm_impl NullTerminated=SA_Maybe +#define __maybzterm_impl NullTerminated=SA_Maybe +#define __notzterm_impl NullTerminated=SA_No + +#define __readaccess_impl Access=SA_Read +#define __writeaccess_impl Access=SA_Write +#define __allaccess_impl Access=SA_ReadWrite + +#define __readaccess_impl_notref Access=SA_Read,Notref=1 +#define __writeaccess_impl_notref Access=SA_Write,Notref=1 +#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 + +#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ + +// For SAL2, we need to expect general expressions. 
+ +#define __cap_impl(size) WritableElements="\n"#size +#define __bytecap_impl(size) WritableBytes="\n"#size +#define __bytecount_impl(size) ValidBytes="\n"#size +#define __count_impl(size) ValidElements="\n"#size + +#else // ][ + +#define __cap_impl(size) WritableElements=#size +#define __bytecap_impl(size) WritableBytes=#size +#define __bytecount_impl(size) ValidBytes=#size +#define __count_impl(size) ValidElements=#size + +#endif // ] + +#define __cap_c_impl(size) WritableElementsConst=size +#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 +#define __cap_for_impl(param) WritableElementsLength=#param +#define __cap_x_impl(size) WritableElements="\n@"#size + +#define __bytecap_c_impl(size) WritableBytesConst=size +#define __bytecap_x_impl(size) WritableBytes="\n@"#size + +#define __mult_impl(mult,size) __cap_impl((mult)*(size)) + +#define __count_c_impl(size) ValidElementsConst=size +#define __count_x_impl(size) ValidElements="\n@"#size + +#define __bytecount_c_impl(size) ValidBytesConst=size +#define __bytecount_x_impl(size) ValidBytes="\n@"#size + + +#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) +#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) + +#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] +#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] + +#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! + +#define _Pre1_impl_(p1) [SA_Pre(p1)] +#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] +#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] + +#define _Post1_impl_(p1) [SA_Post(p1)] +#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Ret1_impl_(p1) [SA_Post(p1)] +#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] +#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] +#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] + + +#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] +#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] +#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] + +// Obsolete -- may be needed for transition to attributes. 
+#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] +#define __inner_exceptthat [SAL_except] + + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) + +#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) +#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) + +#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") +#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") +#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") + +#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ +#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ +#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ +#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ + + +#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) +#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) +#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) + +#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) +#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) +#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) + +#define __null_impl_notref _Notref_ _Null_impl_ +#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ +#define __notnull_impl_notref _Notref_ _Notnull_impl_ + +#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) +#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) + +#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) + +#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) + +#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) +#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) +#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) + +#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + +#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + 
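+
+// Editorial sketch (not part of the original header text): with the definitions
+// above, the declspec forms reduce to string payloads embedded via __declspec.
+// For example:
+//
+//   __cap_impl(n)       -> _SA_annotes1(SAL_writableTo,elementCount(n))
+//                       -> __declspec("SAL_writableTo" "(" "elementCount(n)" ")")
+//   __bytecount_impl(n) -> __declspec("SAL_readableTo" "(" "byteCount(n)" ")")
+//
+// i.e. adjacent string literals that concatenate to "SAL_writableTo(elementCount(n))"
+// and "SAL_readableTo(byteCount(n))" respectively.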
+#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) +#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) +#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) +#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) + +#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! + +#define _Pre1_impl_(p1) _Pre_impl_ p1 +#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 +#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 + +#define _Post1_impl_(p1) _Post_impl_ p1 +#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Ret1_impl_(p1) _Post_impl_ p1 +#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 +#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 +#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 + +#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 + +#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) +#define __inner_exceptthat _SA_annotes0(SAL_except) + +#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ + +// minimum attribute expansion for foreground build + +#pragma push_macro( "SA" ) +#pragma push_macro( "REPEATABLE" ) + +#ifdef __cplusplus // [ +#define SA( id ) id +#define REPEATABLE [repeatable] +#else // !__cplusplus // ][ +#define SA( id ) SA_##id +#define REPEATABLE +#endif // !__cplusplus // ] + +REPEATABLE +[source_annotation_attribute( SA( Parameter ) )] +struct __P_impl +{ +#ifdef __cplusplus // [ + __P_impl(); +#endif // ] + int __d_; +}; +typedef struct __P_impl __P_impl; + +REPEATABLE +[source_annotation_attribute( SA( ReturnValue ) )] +struct __R_impl +{ +#ifdef __cplusplus // [ + __R_impl(); +#endif // ] + int __d_; +}; +typedef struct __R_impl __R_impl; + +[source_annotation_attribute( SA( Method ) )] +struct __M_ +{ +#ifdef __cplusplus // [ + __M_(); +#endif // ] + int __d_; +}; +typedef struct __M_ __M_; + +[source_annotation_attribute( SA( All ) )] +struct __A_ +{ +#ifdef __cplusplus // [ + __A_(); +#endif // ] + int __d_; +}; +typedef struct __A_ __A_; + +[source_annotation_attribute( SA( Field ) )] +struct __F_ +{ +#ifdef __cplusplus // [ + __F_(); +#endif // ] + int __d_; +}; +typedef struct __F_ __F_; + +#pragma pop_macro( "REPEATABLE" ) +#pragma pop_macro( "SA" ) + + +#define _SAL_nop_impl_ + +#define _At_impl_(target, annos) [__A_(__d_=0)] 
+#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] +#define _When_impl_(expr, annos) annos +#define _Group_impl_(annos) annos +#define _GrouP_impl_(annos) annos +#define _Use_decl_anno_impl_ [__M_(__d_=0)] + +#define _Points_to_data_impl_ [__P_impl(__d_=0)] +#define _Literal_impl_ [__P_impl(__d_=0)] +#define _Notliteral_impl_ [__P_impl(__d_=0)] + +#define _Pre_valid_impl_ [__P_impl(__d_=0)] +#define _Post_valid_impl_ [__P_impl(__d_=0)] +#define _Ret_valid_impl_ [__R_impl(__d_=0)] + +#define _Check_return_impl_ [__R_impl(__d_=0)] +#define _Must_inspect_impl_ [__R_impl(__d_=0)] + +#define _Success_impl_(expr) [__M_(__d_=0)] +#define _On_failure_impl_(expr) [__M_(__d_=0)] +#define _Always_impl_(expr) [__M_(__d_=0)] + +#define _Printf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_s_format_string_impl_ [__P_impl(__d_=0)] + +#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] +#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] + +#define _In_bound_impl_ [__P_impl(__d_=0)] +#define _Out_bound_impl_ [__P_impl(__d_=0)] +#define _Ret_bound_impl_ [__R_impl(__d_=0)] +#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] + +#define _Range_impl_(min,max) [__P_impl(__d_=0)] +#define _In_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] +#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] + +#define _Field_range_impl_(min,max) [__F_(__d_=0)] + +#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Satisfies_impl_(cond) [__A_(__d_=0)] + +#define _Null_impl_ [__A_(__d_=0)] +#define _Notnull_impl_ [__A_(__d_=0)] +#define _Maybenull_impl_ [__A_(__d_=0)] + +#define _Valid_impl_ [__A_(__d_=0)] +#define _Notvalid_impl_ [__A_(__d_=0)] +#define _Maybevalid_impl_ [__A_(__d_=0)] + +#define _Readable_bytes_impl_(size) [__A_(__d_=0)] +#define _Readable_elements_impl_(size) [__A_(__d_=0)] +#define _Writable_bytes_impl_(size) [__A_(__d_=0)] +#define _Writable_elements_impl_(size) [__A_(__d_=0)] + +#define _Null_terminated_impl_ [__A_(__d_=0)] +#define _NullNull_terminated_impl_ [__A_(__d_=0)] + +#define _Pre_impl_ [__P_impl(__d_=0)] +#define _Pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Post_impl_ [__P_impl(__d_=0)] +#define _Post1_impl_(p1) [__P_impl(__d_=0)] +#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] +#define _Deref2_post1_impl_(p1) 
//[__P_impl(__d_=0)] +#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] + +#else // ][ + + +#define _SAL_nop_impl_ X + +#define _At_impl_(target, annos) +#define _When_impl_(expr, annos) +#define _Group_impl_(annos) +#define _GrouP_impl_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) +#define _Use_decl_anno_impl_ +#define _Points_to_data_impl_ +#define _Literal_impl_ +#define _Notliteral_impl_ +#define _Notref_impl_ + +#define _Pre_valid_impl_ +#define _Post_valid_impl_ +#define _Ret_valid_impl_ + +#define _Check_return_impl_ +#define _Must_inspect_impl_ + +#define _Success_impl_(expr) +#define _On_failure_impl_(annos) +#define _Always_impl_(annos) + +#define _Printf_format_string_impl_ +#define _Scanf_format_string_impl_ +#define _Scanf_s_format_string_impl_ + +#define _In_bound_impl_ +#define _Out_bound_impl_ +#define _Ret_bound_impl_ +#define _Deref_in_bound_impl_ +#define _Deref_out_bound_impl_ +#define _Deref_ret_bound_impl_ + +#define _Range_impl_(min,max) +#define _In_range_impl_(min,max) +#define _Out_range_impl_(min,max) +#define _Ret_range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) + +#define _Satisfies_impl_(expr) +#define _Pre_satisfies_impl_(expr) +#define _Post_satisfies_impl_(expr) + +#define _Null_impl_ +#define _Notnull_impl_ +#define _Maybenull_impl_ + +#define _Valid_impl_ +#define _Notvalid_impl_ +#define _Maybevalid_impl_ + +#define _Field_range_impl_(min,max) + +#define _Pre_impl_ +#define _Pre1_impl_(p1) +#define _Pre2_impl_(p1,p2) +#define _Pre3_impl_(p1,p2,p3) + +#define _Post_impl_ +#define _Post1_impl_(p1) +#define _Post2_impl_(p1,p2) +#define _Post3_impl_(p1,p2,p3) + +#define _Ret1_impl_(p1) +#define _Ret2_impl_(p1,p2) +#define _Ret3_impl_(p1,p2,p3) + +#define _Deref_pre1_impl_(p1) +#define _Deref_pre2_impl_(p1,p2) +#define _Deref_pre3_impl_(p1,p2,p3) + +#define _Deref_post1_impl_(p1) +#define _Deref_post2_impl_(p1,p2) +#define _Deref_post3_impl_(p1,p2,p3) + +#define _Deref_ret1_impl_(p1) +#define _Deref_ret2_impl_(p1,p2) +#define _Deref_ret3_impl_(p1,p2,p3) + +#define _Deref2_pre1_impl_(p1) +#define _Deref2_post1_impl_(p1) +#define _Deref2_ret1_impl_(p1) + +#define _Readable_bytes_impl_(size) +#define _Readable_elements_impl_(size) +#define _Writable_bytes_impl_(size) +#define _Writable_elements_impl_(size) + +#define _Null_terminated_impl_ +#define _NullNull_terminated_impl_ + +// Obsolete -- may be needed for transition to attributes. +#define __inner_typefix(ctype) +#define __inner_exceptthat + +#endif // ] + +// This section contains the deprecated annotations + +/* + ------------------------------------------------------------------------------- + Introduction + + sal.h provides a set of annotations to describe how a function uses its + parameters - the assumptions it makes about them, and the guarantees it makes + upon finishing. + + Annotations may be placed before either a function parameter's type or its return + type, and describe the function's behavior regarding the parameter or return value. + There are two classes of annotations: buffer annotations and advanced annotations. + Buffer annotations describe how functions use their pointer parameters, and + advanced annotations either describe complex/unusual buffer behavior, or provide + additional information about a parameter that is not otherwise expressible. 
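+
+   For example (an editorial illustration using hypothetical names, not part of
+   the original Microsoft text), both classes can appear on a single declaration;
+   __in_ecount and __out_ecount_part below are buffer annotations built from the
+   table in the next section, while __checkReturn is an advanced annotation:
+
+   __checkReturn HRESULT CopyWidgets(
+       __in_ecount(cIn) const WIDGET *rgIn,               -- read-only input buffer
+       size_t cIn,
+       __out_ecount_part(cOut, *pcWritten) WIDGET *rgOut, -- partially written output buffer
+       size_t cOut,
+       __out size_t *pcWritten);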
+ + ------------------------------------------------------------------------------- + Buffer Annotations + + The most important annotations in sal.h provide a consistent way to annotate + buffer parameters or return values for a function. Each of these annotations describes + a single buffer (which could be a string, a fixed-length or variable-length array, + or just a pointer) that the function interacts with: where it is, how large it is, + how much is initialized, and what the function does with it. + + The appropriate macro for a given buffer can be constructed using the table below. + Just pick the appropriate values from each category, and combine them together + with a leading underscore. Some combinations of values do not make sense as buffer + annotations. Only meaningful annotations can be added to your code; for a list of + these, see the buffer annotation definitions section. + + Only a single buffer annotation should be used for each parameter. + + |------------|------------|---------|--------|----------|----------|---------------| + | Level | Usage | Size | Output | NullTerm | Optional | Parameters | + |------------|------------|---------|--------|----------|----------|---------------| + | <> | <> | <> | <> | _z | <> | <> | + | _deref | _in | _ecount | _full | _nz | _opt | (size) | + | _deref_opt | _out | _bcount | _part | | | (size,length) | + | | _inout | | | | | | + | | | | | | | | + |------------|------------|---------|--------|----------|----------|---------------| + + Level: Describes the buffer pointer's level of indirection from the parameter or + return value 'p'. + + <> : p is the buffer pointer. + _deref : *p is the buffer pointer. p must not be NULL. + _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of + the annotation is ignored. + + Usage: Describes how the function uses the buffer. + + <> : The buffer is not accessed. If used on the return value or with _deref, the + function will provide the buffer, and it will be uninitialized at exit. + Otherwise, the caller must provide the buffer. This should only be used + for alloc and free functions. + _in : The function will only read from the buffer. The caller must provide the + buffer and initialize it. Cannot be used with _deref. + _out : The function will only write to the buffer. If used on the return value or + with _deref, the function will provide the buffer and initialize it. + Otherwise, the caller must provide the buffer, and the function will + initialize it. + _inout : The function may freely read from and write to the buffer. The caller must + provide the buffer and initialize it. If used with _deref, the buffer may + be reallocated by the function. + + Size: Describes the total size of the buffer. This may be less than the space actually + allocated for the buffer, in which case it describes the accessible amount. + + <> : No buffer size is given. If the type specifies the buffer size (such as + with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one + element long. Must be used with _in, _out, or _inout. + _ecount : The buffer size is an explicit element count. + _bcount : The buffer size is an explicit byte count. + + Output: Describes how much of the buffer will be initialized by the function. For + _inout buffers, this also describes how much is initialized at entry. Omit this + category for _in buffers; they must be fully initialized by the caller. + + <> : The type specifies how much is initialized. 
For instance, a function initializing + an LPWSTR must NULL-terminate the string. + _full : The function initializes the entire buffer. + _part : The function initializes part of the buffer, and explicitly indicates how much. + + NullTerm: States whether the presence of a '\0' marks the end of valid elements in the buffer. + _z : A '\0' indicates the end of the buffer. + _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the + buffer. + + Optional: Describes whether the buffer itself is optional. + + <> : The pointer to the buffer must not be NULL. + _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. + + Parameters: Gives explicit counts for the size and length of the buffer. + + <> : There is no explicit count. Use when neither _ecount nor _bcount is used. + (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. + (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part + and _bcount_part. + + ------------------------------------------------------------------------------- + Buffer Annotation Examples + + LWSTDAPI_(BOOL) StrToIntExA( + __in LPCSTR pszString, + DWORD dwFlags, + __out int *piRet -- A pointer whose dereference will be filled in. + ); + + void MyPaintingFunction( + __in HWND hwndControl, -- An initialized read-only parameter. + __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. + __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used + -- and modified. + ); + + LWSTDAPI_(BOOL) PathCompactPathExA( + __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cchMax elements that will + -- be NULL terminated on exit. + __in LPCSTR pszSrc, + UINT cchMax, + DWORD dwFlags + ); + + HRESULT SHLocalAllocBytes( + size_t cb, + __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an + -- uninitialized buffer with cb bytes. + ); + + __inout_bcount_full(cb) : A buffer with cb bytes that is fully initialized at + entry and exit, and may be written to by this function. + + __out_ecount_part(count, *countOut) : A buffer with count elements that will be + partially initialized by this function. The function indicates how much it + initialized by setting *countOut.
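+ + (Additional illustration -- a hypothetical API, not part of the original set of + examples.) Annotation names compose from the table above; combining _deref, _out, + and _ecount yields __deref_out_ecount: + + HRESULT GetWidgets( + __deref_out_ecount(*pcItems) WIDGET **pprgItems, -- On success, *pprgItems points + -- to a buffer of *pcItems elements. + __out size_t *pcItems + );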
+ + ------------------------------------------------------------------------------- + Advanced Annotations + + Advanced annotations describe behavior that is not expressible with the regular + buffer macros. These may be used either to annotate buffer parameters that involve + complex or conditional behavior, or to enrich existing annotations with additional + information. + + __success(expr) f : + <expr> indicates whether function f succeeded or not. If <expr> is true at exit, + all the function's guarantees (as given by other annotations) must hold. If <expr> + is false at exit, the caller should not expect any of the function's guarantees + to hold. If not used, the function must always satisfy its guarantees. Added + automatically to functions that indicate success in standard ways, such as by + returning an HRESULT. + + __nullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + NULL character or pointer. May be used on typedefs, which marks valid (properly + initialized) instances of that type as being NULL-terminated. + + __nullnullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + sequence of two NULL characters or pointers. May be used on typedefs, which marks + valid instances of that type as being double-NULL terminated. + + __reserved v : + Value v must be 0/NULL, reserved for future use. + + __checkReturn v : + Return value v must not be ignored by callers of this function. + + __typefix(ctype) v : + Value v should be treated as an instance of ctype, rather than its declared type. + + __override f : + Specify C#-style 'override' behaviour for overriding virtual methods. + + __callback f : + Function f can be used as a function pointer. + + __format_string p : + Pointer p is a string that contains % markers in the style of printf. + + __blocksOn(resource) f : + Function f blocks on the resource 'resource'. + + __fallthrough : + Annotates switch statement labels where fall-through is desired, to distinguish + from forgotten break statements. + + ------------------------------------------------------------------------------- + Advanced Annotation Examples + + __success(return != FALSE) LWSTDAPI_(BOOL) + PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. + + typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. + + __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be + a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. + + ------------------------------------------------------------------------------- +*/ + +#define __specstrings + +#ifdef __cplusplus // [ +#ifndef __nothrow // [ +# define __nothrow NOTHROW_DECL +#endif // ] +extern "C" { +#else // ][ +#ifndef __nothrow // [ +# define __nothrow +#endif // ] +#endif /* #ifdef __cplusplus */ // ] + + +/* + ------------------------------------------------------------------------------- + Helper Macro Definitions + + These express behavior common to many of the high-level annotations. + DO NOT USE THESE IN YOUR CODE. + ------------------------------------------------------------------------------- +*/ + +/* + The helper annotations are only understood by the compiler version used by + various defect detection tools. When the regular compiler is running, they + are defined into nothing, and do not affect the compiled code. +*/ + +#if !defined(__midl) && defined(_PREFAST_) // [ + + /* + In the primitive "SAL_*" annotations "SAL" stands for Standard + Annotation Language. These "SAL_*" annotations are the + primitives the compiler understands and high-level MACROs + will decompose into these primitives. + */ + + #define _SA_SPECSTRIZE( x ) #x + + /* + __notnull p + __maybenull p + + Annotates a pointer p. States that pointer p is never null or maybe null. + */ + + #define __notnull _Notnull_impl_ + #define __maybenull _Maybenull_impl_ + + /* + __readonly l + __notreadonly l + __maybereadonly l + + Annotates a location l. States that location l is not modified after + this point. If the annotation is placed on the precondition state of + a function, the restriction only applies until the postcondition state + of the function. __maybereadonly states that the annotated location + may be modified, whereas __notreadonly states that a location must be + modified. + */ + + #define __readonly _Pre1_impl_(__readaccess_impl) + #define __notreadonly _Pre1_impl_(__allaccess_impl) + #define __maybereadonly _Pre1_impl_(__readaccess_impl) + + /* + __valid v + __notvalid v + __maybevalid v + + Annotates any value v.
States that the value satisfies all properties of + valid values of its type. For example, for a string buffer, valid means + that the buffer pointer is either NULL or points to a NULL-terminated string. + */ + + #define __valid _Valid_impl_ + #define __notvalid _Notvalid_impl_ + #define __maybevalid _Maybevalid_impl_ + + /* + __readableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be read, extent describes + how much of the buffer is readable. For a reader of the buffer, this is + an explicit permission to read up to that amount, rather than a restriction to + read only up to it. + */ + + #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent) + + /* + + __elem_readableTo(size) + + Annotates a buffer pointer p as being readable to size elements. + */ + + #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size )) + + /* + __byte_readableTo(size) + + Annotates a buffer pointer p as being readable to size bytes. + */ + #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size)) + + /* + __writableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be modified, extent + describes how much of the buffer is writable (usually the allocation + size). For a writer of the buffer, this is an explicit permission to + write up to that amount, rather than a restriction to write only up to it. + */ + #define __writableTo(size) _SA_annotes1(SAL_writableTo, size) + + /* + __elem_writableTo(size) + + Annotates a buffer pointer p as being writable to size elements. + */ + #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size )) + + /* + __byte_writableTo(size) + + Annotates a buffer pointer p as being writable to size bytes. + */ + #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size)) + + /* + __deref p + + Annotates a pointer p. The next annotation applies one dereference down + in the type. If readableTo(p, size) then the next annotation applies to + all elements *(p+i) for which i satisfies the size. If p is a pointer + to a struct, the next annotation applies to all fields of the struct. + */ + #define __deref _Deref_impl_ + + /* + __pre __next_annotation + + The next annotation applies in the precondition state + */ + #define __pre _Pre_impl_ + + /* + __post __next_annotation + + The next annotation applies in the postcondition state + */ + #define __post _Post_impl_ + + /* + __precond(<expr>) + + When <expr> is true, the next annotation applies in the precondition state + (currently not enabled) + */ + #define __precond(expr) __pre + + /* + __postcond(<expr>) + + When <expr> is true, the next annotation applies in the postcondition state + (currently not enabled) + */ + #define __postcond(expr) __post + + /* + __exceptthat + + Given a set of annotations Q containing __exceptthat maybeP, the effect of + the except clause is to erase any P or notP annotations (explicit or + implied) within Q at the same level of dereferencing that the except + clause appears, and to replace it with maybeP. + + Example 1: __valid __pre_except_maybenull on a pointer p means that the + pointer may be null, and is otherwise valid, thus overriding + the implicit notnull annotation implied by __valid on + pointers. + + Example 2: __valid __deref __pre_except_maybenull on an int **p means + that p is not null (implied by valid), but the elements + pointed to by p could be null, and are otherwise valid.
+ */ + #define __exceptthat __inner_exceptthat + + /* + __refparam + + Added to all out parameter macros to indicate that they are all reference + parameters. + */ + #define __refparam _Notref_ __deref __notreadonly + + /* + __inner_* + + Helper macros that directly correspond to certain high-level annotations. + + */ + + /* + Macros to classify the entrypoints and indicate their category. + + Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. + + */ + #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category) + + + /* + Pre-defined data entry point categories include: Registry, File, Network. + */ + #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category) + + #define __inner_override _SA_annotes0(__override) + #define __inner_callback _SA_annotes0(__callback) + #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource) + + #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_ + #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_ + + #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_ + #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_ + + #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size)) + #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size)) + + +#else // ][ + #define __notnull + #define __deref + #define __maybenull + #define __readonly + #define __notreadonly + #define __maybereadonly + #define __valid + #define __notvalid + #define __maybevalid + #define __readableTo(extent) + #define __elem_readableTo(size) + #define __byte_readableTo(size) + #define __writableTo(size) + #define __elem_writableTo(size) + #define __byte_writableTo(size) + #define __pre + #define __post + #define __precond(expr) + #define __postcond(expr) + #define __exceptthat + #define __inner_override + #define __inner_callback + #define __inner_blocksOn(resource) + #define __refparam + #define __inner_control_entrypoint(category) + #define __inner_data_entrypoint(category) + + #define __post_except_maybenull + #define __pre_except_maybenull + #define __post_deref_except_maybenull + #define __pre_deref_except_maybenull + + #define __inexpressible_readableTo(size) + #define __inexpressible_writableTo(size) + +#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ] + +/* +------------------------------------------------------------------------------- +Buffer Annotation Definitions + +Any of these may be used to directly annotate functions, but only one should +be used for each parameter. To determine which annotation to use for a given +buffer, use the table in the buffer annotations section.
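+ +For example (an illustrative, invented declaration, not from the original header): +a function that fills at most cchDest characters of a caller-provided buffer and +reports how many it wrote takes a single buffer annotation per parameter: + + BOOL CopyGreeting(__out_ecount_part(cchDest, *pcchWritten) LPWSTR pszDest, + size_t cchDest, + __out size_t *pcchWritten);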
+------------------------------------------------------------------------------- +*/ + +#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size)) +#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size)) +#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size)) +#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size)) +#define __in_z _SAL1_Source_(__in_z, (), _In_z_) +#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size)) +#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated) +#define __in_nz _SAL1_Source_(__in_nz, (), __in) +#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size)) +#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size)) +#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size)) +#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size)) +#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length)) +#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length)) +#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size)) +#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size)) +#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated) +#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull) +#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated) +#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated) +#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated) +#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated) +#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated) +#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated) +#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam) +#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull_) +#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam) +#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam) +#define __inout _SAL1_Source_(__inout, (), _Inout_) +#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size)) +#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size)) +#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length)) +#define __inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length)) +#define __inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), 
_Inout_updates_all_(size)) +#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) +#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) +#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) +#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) +#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) +#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) +#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) +#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) +#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) +#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) +#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) +#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) +#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) +#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) +#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) +#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) +#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) +#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) +#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) +#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) +#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) +#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) +#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) +#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) +#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) +#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) +#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) +#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) +#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) +#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_nz_opt(size) 
_SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_) +#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull) +#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull) +#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull) +#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull) +#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull) +#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull) +#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size)) +#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt) +#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size)) +#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size)) +#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size)) +#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size)) +#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_) +#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size)) +#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size)) +#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length)) +#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length)) +#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size)) +#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size)) +#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_) +#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated) +#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated) +#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out) +#define __deref_out_ecount_nz(size) _SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size)) 
+#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_ecount(size)) +#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam) +#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated) +#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size)) +#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount, (size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size)) +#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length)) +#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length)) +#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size)) +#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size)) +#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout) +#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size)) +#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size)) +#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull) +#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull) +#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull) +#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull) +#define __deref_out_bcount_opt(size) _SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull) +#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull) +#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull) +#define __deref_out_z_opt _SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) +#define 
__deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) +#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) +#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) +#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) +#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) +#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) +#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) +#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) +#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) +#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) +#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) +#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), 
__deref_out_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) +#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) +#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) +#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) +#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) +#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), 
__deref_out_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) +#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) +#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) +#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) +#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) +#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) +#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) +#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) + +/* +------------------------------------------------------------------------------- +Advanced Annotation Definitions + +Any of these may be used to directly annotate functions, and may be used in +combination with each other or with regular buffer macros. For an explanation +of each annotation, see the advanced annotations section. 
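+ +For example (a hypothetical declaration, invented for illustration), advanced +annotations combine with the buffer annotations above: + + __checkReturn __success(return == TRUE) + BOOL ReadSetting(__in_z LPCSTR pszName, __out_ecount_z(cch) LPSTR pszValue, DWORD cch); + +Here the __out_ecount_z guarantee on pszValue only holds when the function +returns TRUE, and callers must not ignore the return value.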
+------------------------------------------------------------------------------- +*/ + +#define __success(expr) _Success_(expr) +#define __nullterminated _Null_terminated_ +#define __nullnullterminated +#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) +#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) +#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) +#define __override __inner_override +#define __callback __inner_callback +#define __format_string _Printf_format_string_ +#define __blocksOn(resource) __inner_blocksOn(resource) +#define __control_entrypoint(category) __inner_control_entrypoint(category) +#define __data_entrypoint(category) __inner_data_entrypoint(category) +#define __useHeader _Use_decl_anno_impl_ +#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) + +#ifndef __has_cpp_attribute +#define __has_cpp_attribute(x) (0) +#endif + +#ifndef __fallthrough // [ +#if __has_cpp_attribute(fallthrough) +#define __fallthrough [[fallthrough]] +#else +#define __fallthrough +#endif +#endif // ] + +#ifndef __analysis_assume // [ +#ifdef _PREFAST_ // [ +#define __analysis_assume(expr) __assume(expr) +#else // ][ +#define __analysis_assume(expr) +#endif // ] +#endif // ] + +#ifndef _Analysis_assume_ // [ +#ifdef _PREFAST_ // [ +#define _Analysis_assume_(expr) __assume(expr) +#else // ][ +#define _Analysis_assume_(expr) +#endif // ] +#endif // ] + +#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) + +#ifdef _PREFAST_ // [ +__inline __nothrow +void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); + +#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) +#else // ][ +#define _Analysis_assume_nullterminated_(x) +#endif // ] + +// +// Set the analysis mode (global flags to analysis). +// They take effect at the point of declaration; use at global scope +// as a declaration. +// + +// Synthesize a unique symbol. +#define ___MKID(x, y) x ## y +#define __MKID(x, y) ___MKID(x, y) +#define __GENSYM(x) __MKID(x, __COUNTER__) + +__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) + +#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) + +#define _Analysis_mode_(mode) \ + typedef _Analysis_mode_impl_(mode) int \ + __GENSYM(__prefast_analysis_mode_flag); + +// The following are predefined: +// _Analysis_operator_new_throw_ (operator new throws) +// _Analysis_operator_new_null_ (operator new returns null) +// _Analysis_operator_new_never_fails_ (operator new never fails) +// + +// Function class annotations. 
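+// Example (hypothetical, for illustration only): tagging a callback typedef so +// analysis tools can check that functions assigned to it match the class: +// typedef _Function_class_(MY_CALLBACK) void MY_CALLBACK_FN(void *context);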
+__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) +__PRIMOP(int, _In_function_class_(__In_impl_ char*);) +#define _In_function_class_(x) _In_function_class_(#x) + +#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) + +/* + * interlocked operand used in interlocked instructions + */ +//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) + +#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) +#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) + +#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) +#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) + +#ifdef __cplusplus // [ +} +#endif // ] diff --git a/index.idx b/index.idx new file mode 100644 index 0000000..6963a43 Binary files /dev/null and b/index.idx differ diff --git a/position_normal_texture.vtx b/position_normal_texture.vtx new file mode 100644 index 0000000..9c541fc Binary files /dev/null and b/position_normal_texture.vtx differ diff --git a/shader/triangle.hlsl b/shader/triangle.hlsl new file mode 100644 index 0000000..0530835 --- /dev/null +++ b/shader/triangle.hlsl @@ -0,0 +1,24 @@ +static const float2 positions[3] = { + float2(0.0, -0.5), + float2(0.5, 0.5), + float2(-0.5, 0.5) +}; + +struct VSOutput +{ + float4 Pos : SV_POSITION; +}; + +[shader("vertex")] +VSOutput VSMain(uint VertexIndex: SV_VertexID) +{ + VSOutput output = (VSOutput)0; + output.Pos = float4(positions[VertexIndex], 0.0, 1.0); + return output; +} + +[shader("pixel")] +float4 PSMain() : SV_TARGET +{ + return float4(1.0, 0.0, 0.0, 1.0); +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..1116884 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,850 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "volk.h" +#include "vulkan/vk_enum_string_helper.h" +#include "SDL3/SDL.h" +#include "SDL3/SDL_vulkan.h" +#include "directxmath/directxmath.h" + +#include "new.h" + +extern "C" { + extern uint8_t const _binary_position_normal_texture_vtx_start[]; + extern void * const _binary_position_normal_texture_vtx_size; + extern uint8_t const _binary_index_idx_start[]; + extern void * const _binary_index_idx_size; + + extern uint8_t const _binary_shader_triangle_vs_spv_start[]; + extern void * const _binary_shader_triangle_vs_spv_size; + extern uint8_t const _binary_shader_triangle_ps_spv_start[]; + extern void * const _binary_shader_triangle_ps_spv_size; +} + +#define vtx_start _binary_position_normal_texture_vtx_start +#define vtx_size (size_t)(&_binary_position_normal_texture_vtx_size) +#define idx_start _binary_index_idx_start +#define idx_size (size_t)(&_binary_index_idx_size) + +#define vs_start _binary_shader_triangle_vs_spv_start +#define vs_size (size_t)(&_binary_shader_triangle_vs_spv_size) +#define ps_start _binary_shader_triangle_ps_spv_start +#define ps_size (size_t)(&_binary_shader_triangle_ps_spv_size) + +#define SDL_CHECK(f) \ + { \ + bool result = (f); \ + if (result != true) { \ + fprintf(stderr, "SDL: %s %s L%d error: `%s`\n", __FILE__, __func__, __LINE__, SDL_GetError()); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define SDL_CHECK_NONNULL(f) \ + { \ + void * ptr = (void *)(f); \ + if (ptr == nullptr) { \ + fprintf(stderr, "SDL: %s %s L%d error: `%s`\n", __FILE__, __func__, __LINE__, SDL_GetError()); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define VK_CHECK(f) \ + { \ + VkResult result = (f); \ + if (result != VK_SUCCESS) { \ + fprintf(stderr, "VK: %s %s L%d error: `%s`\n", __FILE__, __func__, __LINE__, string_VkResult(result)); \ + exit(EXIT_FAILURE); \
+ } \ + } + +#define VK_CHECK_SWAPCHAIN(f) \ + { \ + VkResult result = (f); \ + if (result == VK_ERROR_OUT_OF_DATE_KHR) { \ + updateSwapchain = true; \ + } else if (result != VK_SUCCESS) { \ + fprintf(stderr, "VK: %s %s L%d error: `%s`\n", __FILE__, __func__, __LINE__, string_VkResult(result)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define ASSERT(expr, msg) \ + { \ + bool result = (expr); \ + if (result != true) { \ + fprintf(stderr, "%s %s L%d error: `%s`\n", __FILE__, __func__, __LINE__, msg); \ + exit(EXIT_FAILURE); \ + } \ + } + +#if defined(_MSC_VER) && !defined(__clang__) // MSVC +#define UNREACHABLE() __assume(false); +#else // GCC, Clang +#define UNREACHABLE() __builtin_unreachable(); +#endif + +constexpr uint32_t maxFramesInFlight{ 2 }; + +VkInstance instance{ VK_NULL_HANDLE }; +VkDevice device{ VK_NULL_HANDLE }; +VkQueue queue{ VK_NULL_HANDLE }; +VkSurfaceKHR surface{ VK_NULL_HANDLE }; +VkSwapchainKHR swapchain{ VK_NULL_HANDLE }; + +VkImage * swapchainImages{ nullptr }; +VkImageView * swapchainImageViews{ nullptr }; + +VkImage depthImage{ VK_NULL_HANDLE }; +VkImageView depthImageView{ VK_NULL_HANDLE }; + +VkBuffer vertexIndexBuffer{ VK_NULL_HANDLE }; + +VkFence fences[maxFramesInFlight]; +VkSemaphore presentSemaphores[maxFramesInFlight]; +VkSemaphore * renderSemaphores{ nullptr }; + +VkCommandPool commandPool{ VK_NULL_HANDLE }; +VkCommandBuffer commandBuffers[maxFramesInFlight]; + +VkPipeline pipeline{ VK_NULL_HANDLE }; +VkPipelineLayout pipelineLayout{ VK_NULL_HANDLE }; + +XMINT2 windowSize{}; + +struct ShaderData { + XMFLOAT4X4 projection; + XMFLOAT4X4 view; + XMFLOAT4X4 model[3]; + XMFLOAT4 lightPos{ 0.0f, -10.0f, 10.0f, 0.0f }; + uint32_t selected{ 1 }; +} shaderData{}; + +struct ShaderDataBuffer { + VkBuffer buffer{ VK_NULL_HANDLE }; + VkDeviceAddress deviceAddress{}; +}; + +ShaderDataBuffer shaderDataBuffers[maxFramesInFlight]; + +static void print_memoryPropertyFlags(VkMemoryPropertyFlags propertyFlags) +{ + int index = 0; + bool first = true; + while (propertyFlags) { + if (propertyFlags & 1) { + if (!first) + printf("|"); + // print the flag name through "%s" so it is never interpreted as a format string + printf("%s", string_VkMemoryPropertyFlagBits((VkMemoryPropertyFlagBits)(1u << index))); + first = false; + } + propertyFlags >>= 1; + index += 1; + } + printf("\n"); +} + +uint32_t findMemoryTypeIndex(VkPhysicalDeviceMemoryProperties const * memoryProperties, uint32_t memoryTypeBits, VkMemoryPropertyFlags propertyFlags) +{ + for (uint32_t i = 0; i < memoryProperties->memoryTypeCount; i++) { + if (!(memoryTypeBits & (1u << i))) + continue; + + // note: this is an exact match on propertyFlags, not a superset test; a + // memory type carrying extra property bits will not match + if (memoryProperties->memoryTypes[i].propertyFlags == propertyFlags) { + return i; + } + } + ASSERT(false, "no memory type index matching memoryTypeBits and propertyFlags"); + UNREACHABLE(); +} + +int main() +{ + SDL_CHECK(SDL_Init(SDL_INIT_VIDEO)); + SDL_CHECK(SDL_Vulkan_LoadLibrary(NULL)); + VK_CHECK(volkInitialize()); + + VkApplicationInfo appInfo{ + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .pApplicationName = "Vulkan", + .apiVersion = VK_API_VERSION_1_3 + }; + + uint32_t instanceExtensionsCount{ 0 }; + char const * const * instanceExtensions{ SDL_Vulkan_GetInstanceExtensions(&instanceExtensionsCount) }; + + VkInstanceCreateInfo instanceCreateInfo{ + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pApplicationInfo = &appInfo, + .enabledExtensionCount = instanceExtensionsCount, + .ppEnabledExtensionNames = instanceExtensions, + }; + VK_CHECK(vkCreateInstance(&instanceCreateInfo, nullptr, &instance)); + + volkLoadInstance(instance); + + ////////////////////////////////////////////////////////////////////// + // physical device and queue family index
+ ////////////////////////////////////////////////////////////////////// + + uint32_t physicalDeviceCount{ 0 }; + VK_CHECK(vkEnumeratePhysicalDevices(instance, &physicalDeviceCount, nullptr)); + VkPhysicalDevice * physicalDevices = NewM<VkPhysicalDevice>(physicalDeviceCount); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &physicalDeviceCount, physicalDevices)); + + uint32_t physicalDeviceIndex{ 0 }; + printf("physicalDeviceCount %u\n", physicalDeviceCount); + for (uint32_t i = 0; i < physicalDeviceCount; i++) { + VkPhysicalDeviceProperties2 properties{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2 }; + vkGetPhysicalDeviceProperties2(physicalDevices[i], &properties); + printf("devices[%u] name: %s%s\n", i, properties.properties.deviceName, (i == physicalDeviceIndex) ? " [selected]" : ""); + if (i == physicalDeviceIndex) { + printf("limits:\n"); + printf(" maxImageDimension1D %u\n", properties.properties.limits.maxImageDimension1D); + printf(" maxImageDimension2D %u\n", properties.properties.limits.maxImageDimension2D); + printf(" maxMemoryAllocationCount %u\n", properties.properties.limits.maxMemoryAllocationCount); + printf(" maxSamplerAllocationCount %u\n", properties.properties.limits.maxSamplerAllocationCount); + } + } + VkPhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex]; + free(physicalDevices); + + uint32_t queueFamilyCount{ 0 }; + vkGetPhysicalDeviceQueueFamilyProperties2(physicalDevice, &queueFamilyCount, nullptr); + VkQueueFamilyProperties2 * queueFamilyProperties = NewM<VkQueueFamilyProperties2>(queueFamilyCount); + for (uint32_t i = 0; i < queueFamilyCount; i++) { + queueFamilyProperties[i].sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2; + queueFamilyProperties[i].pNext = nullptr; + } + vkGetPhysicalDeviceQueueFamilyProperties2(physicalDevice, &queueFamilyCount, queueFamilyProperties); + uint32_t queueFamilyIndex{ ~0u }; + for (uint32_t i = 0; i < queueFamilyCount; i++) { + VkQueueFlags queueFlags = queueFamilyProperties[i].queueFamilyProperties.queueFlags; + if ((queueFlags & VK_QUEUE_GRAPHICS_BIT) && (queueFlags & VK_QUEUE_COMPUTE_BIT)) { + queueFamilyIndex = i; + break; + } + } + ASSERT(queueFamilyIndex != ~0u, "no queue with VK_QUEUE_GRAPHICS_BIT && VK_QUEUE_COMPUTE_BIT"); + free(queueFamilyProperties); + + SDL_CHECK(SDL_Vulkan_GetPresentationSupport(instance, physicalDevice, queueFamilyIndex)); + + ////////////////////////////////////////////////////////////////////// + // memory + ////////////////////////////////////////////////////////////////////// + + VkPhysicalDeviceMemoryProperties2 memoryProperties{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2, + .pNext = nullptr + }; + vkGetPhysicalDeviceMemoryProperties2(physicalDevice, &memoryProperties); + /* + for (uint32_t i = 0; i < memoryProperties.memoryProperties.memoryTypeCount; i++) { + printf("memoryTypes[%u].propertyFlags: ", i); + print_memoryPropertyFlags(memoryProperties.memoryProperties.memoryTypes[i].propertyFlags); + printf("memoryTypes[%u].heapIndex: %u\n", i, memoryProperties.memoryProperties.memoryTypes[i].heapIndex); + } + for (uint32_t i = 0; i < memoryProperties.memoryProperties.memoryHeapCount; i++) { + printf("memoryHeaps[%u].size %lu\n", i, memoryProperties.memoryProperties.memoryHeaps[i].size); + printf("memoryHeaps[%u].flags %08x\n", i, memoryProperties.memoryProperties.memoryHeaps[i].flags); + } + */ + + ////////////////////////////////////////////////////////////////////// + // device and queue + ////////////////////////////////////////////////////////////////////// + + const float
queuePriorities{ 1.0f }; + VkDeviceQueueCreateInfo queueCreateInfo{ + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = queueFamilyIndex, + .queueCount = 1, + .pQueuePriorities = &queuePriorities + }; + VkPhysicalDeviceVulkan12Features enabledVulkan12Features{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .descriptorIndexing = true, + .shaderSampledImageArrayNonUniformIndexing = true, + .descriptorBindingVariableDescriptorCount = true, + .runtimeDescriptorArray = true, + .bufferDeviceAddress = true + }; + VkPhysicalDeviceVulkan13Features enabledVulkan13Features{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + .pNext = &enabledVulkan12Features, + .synchronization2 = true, + .dynamicRendering = true, + }; + VkPhysicalDeviceFeatures enabledFeatures{ + .samplerAnisotropy = VK_TRUE + }; + constexpr uint32_t enabledExtensionCount = 1; + char const * enabledExtensionNames[enabledExtensionCount]{ VK_KHR_SWAPCHAIN_EXTENSION_NAME }; + VkDeviceCreateInfo deviceCreateInfo{ + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = &enabledVulkan13Features, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &queueCreateInfo, + .enabledExtensionCount = enabledExtensionCount, + .ppEnabledExtensionNames = enabledExtensionNames, + .pEnabledFeatures = &enabledFeatures + }; + VK_CHECK(vkCreateDevice(physicalDevice, &deviceCreateInfo, nullptr, &device)); + vkGetDeviceQueue(device, queueFamilyIndex, 0, &queue); + + ////////////////////////////////////////////////////////////////////// + // window and surface + ////////////////////////////////////////////////////////////////////// + + SDL_Window * window = SDL_CreateWindow("Vulkan", 1024, 1024, SDL_WINDOW_VULKAN | SDL_WINDOW_RESIZABLE); + SDL_CHECK_NONNULL(window); + SDL_CHECK(SDL_Vulkan_CreateSurface(window, instance, nullptr, &surface)); + SDL_CHECK(SDL_GetWindowSize(window, &windowSize.x, &windowSize.y)); + VkSurfaceCapabilitiesKHR surfaceCapabilities{}; + VK_CHECK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, surface, &surfaceCapabilities)); + printf("surfaceCapabilities currentExtent %u %u\n", surfaceCapabilities.currentExtent.width, surfaceCapabilities.currentExtent.height); + uint32_t surfaceFormatCount{ 0 }; + VK_CHECK(vkGetPhysicalDeviceSurfaceFormatsKHR(physicalDevice, surface, &surfaceFormatCount, nullptr)); + VkSurfaceFormatKHR * surfaceFormats = NewM<VkSurfaceFormatKHR>(surfaceFormatCount); + VK_CHECK(vkGetPhysicalDeviceSurfaceFormatsKHR(physicalDevice, surface, &surfaceFormatCount, surfaceFormats)); + uint32_t surfaceFormatIndex{ 0 }; + printf("surfaceFormatCount %u\n", surfaceFormatCount); + for (uint32_t i = 0; i < surfaceFormatCount; i++) { + printf("surfaceFormat[%u] %s %s%s\n", i, string_VkFormat(surfaceFormats[i].format), string_VkColorSpaceKHR(surfaceFormats[i].colorSpace), (i == surfaceFormatIndex) ? " [selected]" : ""); + } + + ////////////////////////////////////////////////////////////////////// + // swapchain and images + ////////////////////////////////////////////////////////////////////// + + VkExtent2D imageExtent { + .width = surfaceCapabilities.currentExtent.width, + .height = surfaceCapabilities.currentExtent.height, + }; + VkFormat imageFormat{ surfaceFormats[surfaceFormatIndex].format }; + VkColorSpaceKHR imageColorSpace{ surfaceFormats[surfaceFormatIndex].colorSpace }; + free(surfaceFormats); + VkSwapchainCreateInfoKHR swapchainCreateInfo{ + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = surface, + .minImageCount = surfaceCapabilities.minImageCount, + .imageFormat = imageFormat, + .imageColorSpace = imageColorSpace, + .imageExtent = imageExtent, + .imageArrayLayers = 1, + .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .preTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + .compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + .presentMode = VK_PRESENT_MODE_FIFO_KHR + }; + VK_CHECK(vkCreateSwapchainKHR(device, &swapchainCreateInfo, nullptr, &swapchain)); + + uint32_t swapchainImageCount{ 0 }; + VK_CHECK(vkGetSwapchainImagesKHR(device, swapchain, &swapchainImageCount, nullptr)); + swapchainImages = NewM<VkImage>(swapchainImageCount); + VK_CHECK(vkGetSwapchainImagesKHR(device, swapchain, &swapchainImageCount, swapchainImages)); + swapchainImageViews = NewM<VkImageView>(swapchainImageCount); + for (uint32_t i = 0; i < swapchainImageCount; i++) { + VkImageViewCreateInfo imageViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = swapchainImages[i], + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = imageFormat, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1 + } + }; + VK_CHECK(vkCreateImageView(device, &imageViewCreateInfo, nullptr, &swapchainImageViews[i])); + } + + ////////////////////////////////////////////////////////////////////// + // depth + ////////////////////////////////////////////////////////////////////// + + VkFormat depthFormat{ VK_FORMAT_UNDEFINED }; + constexpr uint32_t depthFormatCount = 2; + VkFormat depthFormatList[depthFormatCount]{ VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }; + for (uint32_t i = 0; i < depthFormatCount; i++) { + VkFormatProperties2 formatProperties{ .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2 }; + vkGetPhysicalDeviceFormatProperties2(physicalDevice, depthFormatList[i], &formatProperties); + if (formatProperties.formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT) { + depthFormat = depthFormatList[i]; + break; + } + } + ASSERT(depthFormat != VK_FORMAT_UNDEFINED, "no depth format with VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT"); + printf("depthFormat: %s\n", string_VkFormat(depthFormat)); + + VkImageCreateInfo depthImageCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = depthFormat, + .extent{ + .width = imageExtent.width, + .height = imageExtent.height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; + + VK_CHECK(vkCreateImage(device, &depthImageCreateInfo, nullptr, &depthImage)); + + VkMemoryRequirements depthImageMemoryRequirements; + vkGetImageMemoryRequirements(device, depthImage, &depthImageMemoryRequirements); + VkMemoryPropertyFlags
+  VkMemoryPropertyFlags depthImageMemoryPropertyFlags{
+    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+  };
+  uint32_t depthImageMemoryTypeIndex = findMemoryTypeIndex(&memoryProperties.memoryProperties, depthImageMemoryRequirements.memoryTypeBits, depthImageMemoryPropertyFlags);
+  VkMemoryAllocateInfo depthImageAllocateInfo{
+    .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+    .allocationSize = depthImageMemoryRequirements.size,
+    .memoryTypeIndex = depthImageMemoryTypeIndex,
+  };
+  VkDeviceMemory depthImageMemory;
+  VK_CHECK(vkAllocateMemory(device, &depthImageAllocateInfo, nullptr, &depthImageMemory));
+  VK_CHECK(vkBindImageMemory(device, depthImage, depthImageMemory, 0));
+
+  VkImageViewCreateInfo depthViewCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+    .image = depthImage,
+    .viewType = VK_IMAGE_VIEW_TYPE_2D,
+    .format = depthFormat,
+    .subresourceRange{
+      .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
+      .levelCount = 1,
+      .layerCount = 1
+    }
+  };
+  VK_CHECK(vkCreateImageView(device, &depthViewCreateInfo, nullptr, &depthImageView));
+
+  //////////////////////////////////////////////////////////////////////
+  // mesh
+  //////////////////////////////////////////////////////////////////////
+
+  VkDeviceSize vtxBufferSize{ vtx_size };
+  VkDeviceSize idxBufferSize{ idx_size };
+  VkBufferCreateInfo vertexIndexBufferCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+    .size = vtxBufferSize + idxBufferSize,
+    .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
+    .sharingMode = VK_SHARING_MODE_EXCLUSIVE
+  };
+  VK_CHECK(vkCreateBuffer(device, &vertexIndexBufferCreateInfo, nullptr, &vertexIndexBuffer));
+
+  VkMemoryRequirements vertexIndexBufferMemoryRequirements;
+  vkGetBufferMemoryRequirements(device, vertexIndexBuffer, &vertexIndexBufferMemoryRequirements);
+  VkMemoryPropertyFlags vertexIndexBufferMemoryPropertyFlags{
+    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
+  };
+  uint32_t vertexIndexBufferMemoryTypeIndex = findMemoryTypeIndex(&memoryProperties.memoryProperties, vertexIndexBufferMemoryRequirements.memoryTypeBits, vertexIndexBufferMemoryPropertyFlags);
+  VkMemoryAllocateInfo vertexIndexBufferAllocateInfo{
+    .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+    .allocationSize = vertexIndexBufferMemoryRequirements.size,
+    .memoryTypeIndex = vertexIndexBufferMemoryTypeIndex,
+  };
+  VkDeviceMemory vertexIndexBufferMemory;
+  VK_CHECK(vkAllocateMemory(device, &vertexIndexBufferAllocateInfo, nullptr, &vertexIndexBufferMemory));
+  VK_CHECK(vkBindBufferMemory(device, vertexIndexBuffer, vertexIndexBufferMemory, 0));
+
+  void * mappedData;
+  VK_CHECK(vkMapMemory(device, vertexIndexBufferMemory, 0, vertexIndexBufferCreateInfo.size, 0, &mappedData));
+  memcpy((char *)mappedData, vtx_start, vtx_size);
+  memcpy((char *)mappedData + vtx_size, idx_start, idx_size);
+  vkUnmapMemory(device, vertexIndexBufferMemory);
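+
+  // Editorial note (not part of the original commit): DEVICE_LOCAL | HOST_VISIBLE
+  // | HOST_COHERENT relies on a BAR/ReBAR-style memory type, which not every
+  // discrete GPU exposes for arbitrary sizes. The portable fallback is a
+  // HOST_VISIBLE staging buffer plus a copy into a DEVICE_LOCAL buffer, roughly:
+  /*
+  // hypothetical: stagingBuffer/uploadCommandBuffer are illustrative one-shot
+  // upload objects, not created anywhere in this commit
+  VkBufferCopy region{ .srcOffset = 0, .dstOffset = 0, .size = vtxBufferSize + idxBufferSize };
+  vkCmdCopyBuffer(uploadCommandBuffer, stagingBuffer, vertexIndexBuffer, 1, &region);
+  */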
+
+  //////////////////////////////////////////////////////////////////////
+  // shader buffers
+  //////////////////////////////////////////////////////////////////////
+
+  for (uint32_t i = 0; i < maxFramesInFlight; i++) {
+    VkBufferCreateInfo shaderBufferCreateInfo{
+      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+      .size = sizeof(ShaderData),
+      .usage = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+      .sharingMode = VK_SHARING_MODE_EXCLUSIVE
+    };
+
+    VK_CHECK(vkCreateBuffer(device, &shaderBufferCreateInfo, nullptr, &shaderDataBuffers[i].buffer));
+
+    VkMemoryRequirements shaderBufferMemoryRequirements;
+    vkGetBufferMemoryRequirements(device, shaderDataBuffers[i].buffer, &shaderBufferMemoryRequirements);
+
+    VkMemoryPropertyFlags shaderBufferMemoryPropertyFlags{
+      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
+    };
+    uint32_t shaderBufferMemoryTypeIndex = findMemoryTypeIndex(&memoryProperties.memoryProperties, shaderBufferMemoryRequirements.memoryTypeBits, shaderBufferMemoryPropertyFlags);
+    VkMemoryAllocateFlagsInfo shaderBufferAllocateFlagsInfo{
+      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
+      .flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT,
+    };
+    VkMemoryAllocateInfo shaderBufferAllocateInfo{
+      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+      .pNext = &shaderBufferAllocateFlagsInfo,
+      .allocationSize = shaderBufferMemoryRequirements.size,
+      .memoryTypeIndex = shaderBufferMemoryTypeIndex,
+    };
+    // note: this handle is not stored anywhere, so the allocation is never freed
+    VkDeviceMemory shaderBufferMemory;
+    VK_CHECK(vkAllocateMemory(device, &shaderBufferAllocateInfo, nullptr, &shaderBufferMemory));
+    VK_CHECK(vkBindBufferMemory(device, shaderDataBuffers[i].buffer, shaderBufferMemory, 0));
+
+    VkBufferDeviceAddressInfo shaderBufferDeviceAddressInfo{
+      .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
+      .buffer = shaderDataBuffers[i].buffer
+    };
+    shaderDataBuffers[i].deviceAddress = vkGetBufferDeviceAddress(device, &shaderBufferDeviceAddressInfo);
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // synchronization objects
+  //////////////////////////////////////////////////////////////////////
+
+  VkSemaphoreCreateInfo semaphoreCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO
+  };
+  VkFenceCreateInfo fenceCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+    .flags = VK_FENCE_CREATE_SIGNALED_BIT
+  };
+  for (uint32_t i = 0; i < maxFramesInFlight; i++) {
+    VK_CHECK(vkCreateFence(device, &fenceCreateInfo, nullptr, &fences[i]));
+    VK_CHECK(vkCreateSemaphore(device, &semaphoreCreateInfo, nullptr, &presentSemaphores[i]));
+  }
+  renderSemaphores = NewM<VkSemaphore>(swapchainImageCount);
+  for (uint32_t i = 0; i < swapchainImageCount; i++) {
+    VK_CHECK(vkCreateSemaphore(device, &semaphoreCreateInfo, nullptr, &renderSemaphores[i]));
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // command
+  //////////////////////////////////////////////////////////////////////
+
+  VkCommandPoolCreateInfo commandPoolCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+    .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+    .queueFamilyIndex = queueFamilyIndex
+  };
+  VK_CHECK(vkCreateCommandPool(device, &commandPoolCreateInfo, nullptr, &commandPool));
+
+  VkCommandBufferAllocateInfo commandBufferAllocateCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+    .commandPool = commandPool,
+    .commandBufferCount = maxFramesInFlight
+  };
+  VK_CHECK(vkAllocateCommandBuffers(device, &commandBufferAllocateCreateInfo, commandBuffers));
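+
+  // Editorial note (not part of the original commit): one command buffer per
+  // frame in flight, re-recorded every frame. RESET_COMMAND_BUFFER is what
+  // permits the per-buffer vkResetCommandBuffer in the loop below; the common
+  // alternative is one pool per frame, reset wholesale:
+  /*
+  // hypothetical: commandPools would be a per-frame array of pools
+  vkResetCommandPool(device, commandPools[frameIndex], 0);
+  */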
+
+  //////////////////////////////////////////////////////////////////////
+  // shaders
+  //////////////////////////////////////////////////////////////////////
+
+  VkShaderModuleCreateInfo vertexShaderModuleCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+    .codeSize = vs_size,
+    .pCode = (uint32_t *)vs_start
+  };
+  VkShaderModule vertexShaderModule{};
+  VK_CHECK(vkCreateShaderModule(device, &vertexShaderModuleCreateInfo, nullptr, &vertexShaderModule));
+
+  VkShaderModuleCreateInfo pixelShaderModuleCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+    .codeSize = ps_size,
+    .pCode = (uint32_t *)ps_start
+  };
+  VkShaderModule pixelShaderModule{};
+  VK_CHECK(vkCreateShaderModule(device, &pixelShaderModuleCreateInfo, nullptr, &pixelShaderModule));
+
+  //////////////////////////////////////////////////////////////////////
+  // pipeline
+  //////////////////////////////////////////////////////////////////////
+
+  VkPushConstantRange pushConstantRange{
+    .stageFlags = VK_SHADER_STAGE_VERTEX_BIT,
+    .size = sizeof(VkDeviceAddress)
+  };
+  VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+    .setLayoutCount = 0,
+    .pSetLayouts = nullptr,
+    //.setLayoutCount = 1,
+    //.pSetLayouts = &descriptorSetLayoutTex,
+    .pushConstantRangeCount = 1,
+    .pPushConstantRanges = &pushConstantRange
+  };
+  VK_CHECK(vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, nullptr, &pipelineLayout));
+
+  VkVertexInputBindingDescription vertexBindingDescriptions[1]{
+    {
+      .binding = 0,
+      .stride = (3 * (sizeof (float)) * 3),   // position, normal, texcoord: three vec3s per vertex
+      .inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+    }
+  };
+  VkVertexInputAttributeDescription vertexAttributeDescriptions[3]{
+    { .location = 0, .binding = 0, .format = VK_FORMAT_R32G32B32_SFLOAT, .offset = 3 * 4 * 0 },
+    { .location = 1, .binding = 0, .format = VK_FORMAT_R32G32B32_SFLOAT, .offset = 3 * 4 * 1 },
+    { .location = 2, .binding = 0, .format = VK_FORMAT_R32G32B32_SFLOAT, .offset = 3 * 4 * 2 },
+  };
+  VkPipelineVertexInputStateCreateInfo vertexInputState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+    .vertexBindingDescriptionCount = 1,
+    .pVertexBindingDescriptions = vertexBindingDescriptions,
+    .vertexAttributeDescriptionCount = 3,
+    .pVertexAttributeDescriptions = vertexAttributeDescriptions,
+  };
+
+  VkPipelineInputAssemblyStateCreateInfo inputAssemblyState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+    .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST
+  };
+
+  VkPipelineShaderStageCreateInfo shaderStages[2]{
+    {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_VERTEX_BIT,
+      .module = vertexShaderModule,
+      .pName = "VSMain"
+    },
+    {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+      .module = pixelShaderModule,
+      .pName = "PSMain"
+    }
+  };
+
+  VkPipelineViewportStateCreateInfo viewportState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+    .viewportCount = 1,
+    .scissorCount = 1
+  };
+
+  VkDynamicState dynamicStates[2]{
+    VK_DYNAMIC_STATE_VIEWPORT,
+    VK_DYNAMIC_STATE_SCISSOR
+  };
+  VkPipelineDynamicStateCreateInfo dynamicState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+    .dynamicStateCount = 2,
+    .pDynamicStates = dynamicStates
+  };
+
+  VkPipelineDepthStencilStateCreateInfo depthStencilState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+    .depthTestEnable = VK_TRUE,
+    .depthWriteEnable = VK_TRUE,
+    .depthCompareOp = VK_COMPARE_OP_LESS_OR_EQUAL
+  };
+
+  VkPipelineRenderingCreateInfo renderingCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
+    .colorAttachmentCount = 1,
+    .pColorAttachmentFormats = &imageFormat,
+    .depthAttachmentFormat = depthFormat
+  };
+
+  VkPipelineColorBlendAttachmentState blendAttachment{
+    .colorWriteMask = 0xF
+  };
+  VkPipelineColorBlendStateCreateInfo colorBlendState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+    .attachmentCount = 1,
+    .pAttachments = &blendAttachment
+  };
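+
+  // Illustrative sketch (editor's addition, not part of the original commit):
+  // blending is left disabled above, so only colorWriteMask matters; standard
+  // alpha blending would instead fill the attachment state roughly like this:
+  /*
+  VkPipelineColorBlendAttachmentState blended{
+    .blendEnable = VK_TRUE,
+    .srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA,
+    .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+    .colorBlendOp = VK_BLEND_OP_ADD,
+    .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE,
+    .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
+    .alphaBlendOp = VK_BLEND_OP_ADD,
+    .colorWriteMask = 0xF
+  };
+  */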
+  VkPipelineRasterizationStateCreateInfo rasterizationState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+    .lineWidth = 1.0f
+  };
+  VkPipelineMultisampleStateCreateInfo multisampleState{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+    .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT
+  };
+
+  VkGraphicsPipelineCreateInfo pipelineCreateInfo{
+    .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+    .pNext = &renderingCreateInfo,
+    .stageCount = 2,
+    .pStages = shaderStages,
+    .pVertexInputState = &vertexInputState,
+    .pInputAssemblyState = &inputAssemblyState,
+    .pViewportState = &viewportState,
+    .pRasterizationState = &rasterizationState,
+    .pMultisampleState = &multisampleState,
+    .pDepthStencilState = &depthStencilState,
+    .pColorBlendState = &colorBlendState,
+    .pDynamicState = &dynamicState,
+    .layout = pipelineLayout
+  };
+  VK_CHECK(vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineCreateInfo, nullptr, &pipeline));
+
+  //////////////////////////////////////////////////////////////////////
+  // loop
+  //////////////////////////////////////////////////////////////////////
+
+  bool updateSwapchain{ false };
+  uint32_t frameIndex{ 0 };
+  uint32_t imageIndex{ 0 };
+  bool quit{ false };
+  while (quit == false) {
+    SDL_Event event;
+    while (SDL_PollEvent(&event)) {
+      if (event.type == SDL_EVENT_QUIT) {
+        quit = true;
+      }
+    }
+
+    // wait for fence
+    VK_CHECK(vkWaitForFences(device, 1, &fences[frameIndex], true, UINT64_MAX));
+    VK_CHECK(vkResetFences(device, 1, &fences[frameIndex]));
+
+    // acquire next image
+    VK_CHECK_SWAPCHAIN(vkAcquireNextImageKHR(device, swapchain, UINT64_MAX, presentSemaphores[frameIndex], VK_NULL_HANDLE, &imageIndex));
+
+    // command buffer
+    VkCommandBuffer& commandBuffer = commandBuffers[frameIndex];
+    VK_CHECK(vkResetCommandBuffer(commandBuffer, 0));
+    VkCommandBufferBeginInfo commandBufferBeginInfo{
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
+    };
+    VK_CHECK(vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo));
+
+    VkImageMemoryBarrier2 outputBarriers[2]{
+      VkImageMemoryBarrier2{
+        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+        .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+        .srcAccessMask = 0,
+        .dstStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+        .dstAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
+        .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+        .newLayout = VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL,
+        .image = swapchainImages[imageIndex],
+        .subresourceRange{
+          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+          .levelCount = 1,
+          .layerCount = 1
+        }
+      },
+      VkImageMemoryBarrier2{
+        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+        .srcStageMask = VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT,
+        .srcAccessMask = VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+        .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
+        .dstAccessMask = VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+        .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+        .newLayout = VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL,
+        .image = depthImage,
+        .subresourceRange{
+          .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
+          .levelCount = 1,
+          .layerCount = 1
+        }
+      }
+    };
+    VkDependencyInfo barrierDependencyInfo{
+      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+      .imageMemoryBarrierCount = 2,
+      .pImageMemoryBarriers = outputBarriers
+    };
+    vkCmdPipelineBarrier2(commandBuffer, &barrierDependencyInfo);
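+
+    // Editorial note (not part of the original commit): both barriers transition
+    // from UNDEFINED, so prior image contents are discarded each frame. The color
+    // barrier chains against the acquire semaphore's wait at
+    // COLOR_ATTACHMENT_OUTPUT (hence srcAccessMask 0), and the depth barrier
+    // orders this frame's fragment tests against the previous frame's depth
+    // writes on the shared depth image.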
+
+    VkRenderingAttachmentInfo colorRenderingAttachmentInfo{
+      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
+      .imageView = swapchainImageViews[imageIndex],
+      .imageLayout = VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL,
+      .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
+      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+      .clearValue{ .color{ 0.0f, 0.0f, 0.2f, 1.0f } }
+    };
+    VkRenderingAttachmentInfo depthRenderingAttachmentInfo{
+      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
+      .imageView = depthImageView,
+      .imageLayout = VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL,
+      .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
+      .storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE,
+      .clearValue{ .depthStencil{ 1.0f, 0 } }
+    };
+
+    VkRenderingInfo renderingInfo{
+      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
+      .renderArea{ .extent{ .width = (uint32_t)windowSize.x, .height = (uint32_t)windowSize.y } },
+      .layerCount = 1,
+      .colorAttachmentCount = 1,
+      .pColorAttachments = &colorRenderingAttachmentInfo,
+      .pDepthAttachment = &depthRenderingAttachmentInfo
+    };
+    vkCmdBeginRendering(commandBuffer, &renderingInfo);
+
+    VkViewport viewport{
+      .width = static_cast<float>(windowSize.x),
+      .height = static_cast<float>(windowSize.y),
+      .minDepth = 0.0f,
+      .maxDepth = 1.0f
+    };
+    vkCmdSetViewport(commandBuffer, 0, 1, &viewport);
+    VkRect2D scissor{ .extent{ .width = (uint32_t)windowSize.x, .height = (uint32_t)windowSize.y } };
+    vkCmdSetScissor(commandBuffer, 0, 1, &scissor);
+
+    vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
+    VkDeviceSize vertexOffset{ 0 };
+    //vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSetTex, 0, nullptr);
+    vkCmdBindVertexBuffers(commandBuffer, 0, 1, &vertexIndexBuffer, &vertexOffset);
+    VkDeviceSize indexOffset{ vtxBufferSize };
+    vkCmdBindIndexBuffer(commandBuffer, vertexIndexBuffer, indexOffset, VK_INDEX_TYPE_UINT32);
+    vkCmdPushConstants(commandBuffer, pipelineLayout, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(VkDeviceAddress), &shaderDataBuffers[frameIndex].deviceAddress);
+    uint32_t indexCount{ 3 };
+    vkCmdDrawIndexed(commandBuffer, indexCount, 1, 0, 0, 0);
+    vkCmdEndRendering(commandBuffer);
+
+    VkImageMemoryBarrier2 barrierPresent{
+      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+      .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+      .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
+      .dstStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+      .dstAccessMask = 0,
+      .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+      .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+      .image = swapchainImages[imageIndex],
+      .subresourceRange{ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, .levelCount = 1, .layerCount = 1 }
+    };
+    VkDependencyInfo barrierPresentDependencyInfo{
+      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+      .imageMemoryBarrierCount = 1,
+      .pImageMemoryBarriers = &barrierPresent
+    };
+    vkCmdPipelineBarrier2(commandBuffer, &barrierPresentDependencyInfo);
+
+    VK_CHECK(vkEndCommandBuffer(commandBuffer));
+
+    // submit to graphics queue
+    VkPipelineStageFlags waitStages = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+    VkSubmitInfo submitInfo{
+      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+      .waitSemaphoreCount = 1,
+      .pWaitSemaphores = &presentSemaphores[frameIndex],
+      .pWaitDstStageMask = &waitStages,
+      .commandBufferCount = 1,
+      .pCommandBuffers = &commandBuffer,
+      .signalSemaphoreCount = 1,
+      .pSignalSemaphores = &renderSemaphores[imageIndex],
+    };
+    VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, fences[frameIndex]));
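+
+    // Illustrative sketch (editor's addition, not part of the original commit):
+    // since synchronization2 is enabled, the same submit can be expressed with
+    // vkQueueSubmit2, which takes per-semaphore stage masks in place of
+    // pWaitDstStageMask:
+    /*
+    VkSemaphoreSubmitInfo waitInfo{
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+      .semaphore = presentSemaphores[frameIndex],
+      .stageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT
+    };
+    VkSemaphoreSubmitInfo signalInfo{
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+      .semaphore = renderSemaphores[imageIndex],
+      .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT
+    };
+    VkCommandBufferSubmitInfo commandBufferInfo{
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+      .commandBuffer = commandBuffer
+    };
+    VkSubmitInfo2 submitInfo2{
+      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+      .waitSemaphoreInfoCount = 1,
+      .pWaitSemaphoreInfos = &waitInfo,
+      .commandBufferInfoCount = 1,
+      .pCommandBufferInfos = &commandBufferInfo,
+      .signalSemaphoreInfoCount = 1,
+      .pSignalSemaphoreInfos = &signalInfo
+    };
+    VK_CHECK(vkQueueSubmit2(queue, 1, &submitInfo2, fences[frameIndex]));
+    */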
+
+    frameIndex = (frameIndex + 1) % maxFramesInFlight;
+
+    // present
+    VkPresentInfoKHR presentInfo{
+      .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
+      .waitSemaphoreCount = 1,
+      .pWaitSemaphores = &renderSemaphores[imageIndex],
+      .swapchainCount = 1,
+      .pSwapchains = &swapchain,
+      .pImageIndices = &imageIndex
+    };
+    VK_CHECK_SWAPCHAIN(vkQueuePresentKHR(queue, &presentInfo));
+
+    assert(updateSwapchain == false);
+  }
+}
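+
+// Editorial note (not part of the original commit): teardown is omitted; on a
+// real exit path one would call vkDeviceWaitIdle(device) and then destroy the
+// pipeline, swapchain, image views, buffers, memory, and synchronization
+// objects in reverse creation order before vkDestroyDevice / vkDestroyInstance.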