diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 96ef22dd13..a6c8fd17ba 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1261,7 +1261,14 @@ struct ByteAddressBuffer
         memcpy(&data, ((const char*)this->data) + index, sizeof(T));
         return data;
     }
-
+    template
+    SLANG_CUDA_CALL StructuredBuffer asStructuredBuffer() const
+    {
+        StructuredBuffer rs;
+        rs.data = (T*)data;
+        rs.count = sizeInBytes / sizeof(T);
+        return rs;
+    }
     const uint32_t* data;
     size_t sizeInBytes; //< Must be multiple of 4
 };
@@ -1348,7 +1355,14 @@ struct RWByteAddressBuffer
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
         return (T*)(((char*)data) + index);
     }
-
+    template
+    SLANG_CUDA_CALL RWStructuredBuffer asStructuredBuffer() const
+    {
+        RWStructuredBuffer rs;
+        rs.data = (T*)data;
+        rs.count = sizeInBytes / sizeof(T);
+        return rs;
+    }
     uint32_t* data;
     size_t sizeInBytes; //< Must be multiple of 4
 };
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 084654d0fe..67ec91cf63 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -299,6 +299,18 @@ interface __BuiltinSignedArithmeticType : __BuiltinArithmeticType {}
 interface __BuiltinIntegerType : __BuiltinArithmeticType, IInteger
 {}
 
+/// Represent a `int` or `uint` type.
+[sealed]
+[builtin]
+interface __BuiltinInt32Type : __BuiltinIntegerType
+{}
+
+/// Represent a `int64_t` or `uint64_t` type.
+[sealed]
+[builtin]
+interface __BuiltinInt64Type : __BuiltinIntegerType
+{}
+
 /// Represent builtin types that can represent a real number.
 [sealed]
 [builtin]
@@ -602,6 +614,14 @@ ${{{{
 }}}}
     , __BuiltinArithmeticType
     , __BuiltinIntegerType
+${{{{
+    if (kBaseTypes[tt].tag == BaseType::Int || kBaseTypes[tt].tag == BaseType::UInt)
+}}}}
+    , __BuiltinInt32Type
+${{{{
+    if (kBaseTypes[tt].tag == BaseType::Int64 || kBaseTypes[tt].tag == BaseType::UInt64)
+}}}}
+    , __BuiltinInt64Type
 ${{{{
     ; // fall through
     case BaseType::Bool:
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 191fa31958..1c01c2f6bd 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -3923,6746 +3923,5061 @@ ${{{{
     }
 }}}}
 
-// AtomicAdd
-// Make the GLSL atomicAdd available.
-// We have separate int/float implementations, as the float version requires some specific extensions
-// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt
+// Atomic intrinsic insts.
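// (Editorial illustration, not part of the patch.) A minimal sketch of how the
// lowered atomic intrinsics declared just below are expected to be used, assuming
// a hypothetical groupshared counter `gCounter`:
//
//     groupshared uint gCounter;
//
//     void bumpCounter(out uint previous)
//     {
//         // __atomic_add returns the value observed before the addition;
//         // the memory-order argument defaults to MemoryOrder.Relaxed.
//         previous = __atomic_add(gCounter, 1);
//     }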
+ +__intrinsic_op($(kIROp_AtomicExchange)) +T __atomic_exchange(__ref T val, T newValue, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicCompareExchange)) +T __atomic_compare_exchange( + __ref T val, + T compareValue, + T newValue, + MemoryOrder successOrder = MemoryOrder.Relaxed, + MemoryOrder failOrder = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicAdd)) +T __atomic_add(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicSub)) +T __atomic_sub(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicMax)) +T __atomic_max(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicMin)) +T __atomic_min(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicAnd)) +T __atomic_and(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicOr)) +T __atomic_or(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicXor)) +T __atomic_xor(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicInc)) +T __atomic_increment(__ref T val, MemoryOrder order = MemoryOrder.Relaxed); +__intrinsic_op($(kIROp_AtomicDec)) +T __atomic_decrement(__ref T val, MemoryOrder order = MemoryOrder.Relaxed); -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_float) -[ForceInline] -[require(glsl_spirv, atomic_glsl_float1)] -float __atomicAdd(__ref float value, float amount) -{ - __target_switch - { - case glsl: __intrinsic_asm "atomicAdd($0, $1)"; - case spirv: - return spirv_asm - { - OpExtension "SPV_EXT_shader_atomic_float_add"; - OpCapability AtomicFloat32AddEXT; - result:$$float = OpAtomicFAddEXT &value Device None $amount - }; - } -} +// Conversion between uint64_t and uint2 -__glsl_version(430) -__glsl_extension(GL_NV_shader_atomic_fp16_vector) -[ForceInline] -[require(glsl_spirv, atomic_glsl_halfvec)] -half2 __atomicAdd(__ref half2 value, half2 amount) +[require(cpp_cuda_glsl_hlsl_spirv, sm_4_0_version)] +uint2 __asuint2(uint64_t i) { - __target_switch - { - case glsl: __intrinsic_asm "atomicAdd($0, $1)"; - case spirv: - return spirv_asm - { - OpExtension "SPV_EXT_shader_atomic_float_add"; - OpCapability AtomicFloat32AddEXT; - result:$$half2 = OpAtomicFAddEXT &value Device None $amount - }; - } + return uint2(uint(i), uint(uint64_t(i) >> 32)); } -// Helper for hlsl, using NVAPI -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2) +[require(cpp_cuda_glsl_hlsl_spirv, sm_4_0_version)] +uint64_t __asuint64(uint2 i) { - __target_switch - { - case hlsl: __intrinsic_asm "NvInterlockedAddUint64($0, $1, $2)"; - } + return (uint64_t(i.y) << 32) | i.x; } -// atomic add for hlsl using SM6.6 -[require(hlsl, atomic_hlsl_sm_6_6)] -void __atomicAdd(RWByteAddressBuffer buf, uint offset, int64_t value, out int64_t originalValue) -{ - __target_switch - { - case hlsl: __intrinsic_asm "$0.InterlockedAdd64($1, $2, $3)"; - } -} +// -[require(hlsl, atomic_hlsl_sm_6_6)] -void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint64_t originalValue) -{ - __target_switch - { - case hlsl: __intrinsic_asm "$0.InterlockedAdd64($1, $2, $3)"; - } -} +__intrinsic_op($(kIROp_ByteAddressBufferLoad)) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer)] +T __byteAddressBufferLoad(ByteAddressBuffer buffer, int offset, int alignment); -// Int versions 
require glsl 4.30 -// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml +__intrinsic_op($(kIROp_ByteAddressBufferLoad)) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer_rw)] +T __byteAddressBufferLoad(RWByteAddressBuffer buffer, int offset, int alignment); -__glsl_version(430) -[ForceInline] -[require(glsl_spirv, atomic_glsl)] -int __atomicAdd(__ref int value, int amount) -{ - __target_switch - { - case glsl: __intrinsic_asm "atomicAdd($0, $1)"; - case spirv: - return spirv_asm - { - result:$$int = OpAtomicIAdd &value Device None $amount; - }; - } -} +__intrinsic_op($(kIROp_ByteAddressBufferLoad)) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer_rw)] +T __byteAddressBufferLoad(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment); -__glsl_version(430) -[ForceInline] -[require(glsl_spirv, atomic_glsl)] -uint __atomicAdd(__ref uint value, uint amount) -{ - __target_switch - { - case glsl: __intrinsic_asm "atomicAdd($0, $1)"; - case spirv: - return spirv_asm - { - result:$$uint = OpAtomicIAdd &value Device None $amount; - }; - } -} +__intrinsic_op($(kIROp_ByteAddressBufferStore)) +[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] +void __byteAddressBufferStore(RWByteAddressBuffer buffer, int offset, int alignment, T value); -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -int64_t __atomicAdd(__ref int64_t value, int64_t amount) -{ - __target_switch - { - case glsl: __intrinsic_asm "atomicAdd($0, $1)"; - case spirv: - return spirv_asm - { - OpCapability Int64Atomics; - result:$$int64_t = OpAtomicIAdd &value Device None $amount - }; - } -} +__intrinsic_op($(kIROp_ByteAddressBufferStore)) +[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] +void __byteAddressBufferStore(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment, T value); -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount) +/** +Represents an opaque handle to a read-only structured buffer allocated in global memory. +A structured buffer can be viewed as an array of the specified element type. +@param T The element type of the buffer. +@param L The memory layout of the buffer. +@remarks +The `L` generic parameter is used to specify the memory layout of the buffer when +generating SPIRV. +`L` must be one of `DefaultDataLayout`, `Std140DataLayout`, `Std430DataLayout` or `ScalarDataLayout`. +The default value is `DefaultDataLayout`. +When generating code for other targets, this parameter is ignored and has no effect on the generated code. +@see `RWStructuredBuffer`, `AppendStructuredBuffer`, `ConsumeStructuredBuffer`, `RasterizerOrderedStructuredBuffer`. 
+@category buffer_types Buffer types +**/ +__generic +__magic_type(HLSLStructuredBufferType) +__intrinsic_type($(kIROp_HLSLStructuredBufferType)) +struct StructuredBuffer { - __target_switch + [__readNone] + [ForceInline] + void GetDimensions( + out uint numStructs, + out uint stride) { - case glsl: __intrinsic_asm "atomicAdd($0, $1)"; - case spirv: - return spirv_asm - { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicIAdd &value Device None $amount - }; + let rs = __structuredBufferGetDimensions(this); + numStructs = rs.x; + stride = rs.y; } -} -// Cas - Compare and swap + __intrinsic_op($(kIROp_StructuredBufferLoad)) + [__readNone] + [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer)] + T Load(TIndex location); -// Helper for HLSL, using NVAPI + __intrinsic_op($(kIROp_StructuredBufferLoadStatus)) + [require(hlsl, structuredbuffer)] + T Load(TIndex location, out uint status); -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value) -{ - __target_switch + __generic + __subscript(TIndex index) -> T { - case hlsl: __intrinsic_asm "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)"; - } -} + [__readNone] + __intrinsic_op($(kIROp_StructuredBufferLoad)) + [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer)] + get; + }; +}; -// CAS using SM6.6 -[require(hlsl, atomic_hlsl_sm_6_6)] -void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in int64_t value, out int64_t original_value) +/** +Represents an opaque handle to a consume structured buffer allocated in global memory. +A structured buffer can be viewed as an array of the specified element type. +An append structure buffer internally maintains an atomic counter to keep track of the number of elements in the buffer, +and provide an atomic operation to append a new element to the buffer. +@param T The element type of the buffer. +@param L The memory layout of the buffer. +@remarks +This type is supported natively when targeting HLSL. +When generating code for other targets, this type is translated into a pair or an ordinary `StructuredBuffer` and +a separate `RWStructuredBuffer` that holds the atomic counter. +The `L` generic parameter is used to specify the memory layout of the buffer when +generating SPIRV. +`L` must be one of `DefaultDataLayout`, `Std140DataLayout`, `Std430DataLayout` or `ScalarDataLayout`. +The default value is `DefaultDataLayout`. +When generating code for other targets, this parameter is ignored and has no effect on the generated code. +@see `StructuredBuffer`, `AppendStructuredBuffer`, `RWStructuredBuffer`, `RasterizerOrderedStructuredBuffer`. 
+@category buffer_types +*/ +__generic +__magic_type(HLSLConsumeStructuredBufferType) +__intrinsic_type($(kIROp_HLSLConsumeStructuredBufferType)) +[require(cpp_cuda_glsl_hlsl_spirv, consumestructuredbuffer)] +struct ConsumeStructuredBuffer { - __target_switch - { - case hlsl: __intrinsic_asm "$0.InterlockedCompareExchange64($1, $2, $3, $4)"; - } -} + __intrinsic_op($(kIROp_StructuredBufferConsume)) + T Consume(); -[require(hlsl, atomic_hlsl_sm_6_6)] -void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value) -{ - __target_switch + [ForceInline] + void GetDimensions( + out uint numStructs, + out uint stride) { - case hlsl: __intrinsic_asm "$0.InterlockedCompareExchange64($1, $2, $3, $4)"; + let result = __structuredBufferGetDimensions(this); + numStructs = result.x; + stride = result.y; } -} +}; -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -int64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue) +__intrinsic_op($(kIROp_GetElement)) +T __getElement(U collection, I index); + +/// @category stage_io Stage IO types +__generic +[require(glsl_hlsl_spirv, hull)] +__magic_type(HLSLInputPatchType) +__intrinsic_type($(kIROp_HLSLInputPatchType)) +struct InputPatch { - __target_switch + __generic + __subscript(TIndex index)->T { - case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)"; - case spirv: - return spirv_asm + [__unsafeForceInlineEarly] + get { - OpCapability Int64Atomics; - result:$$int64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue - }; + __target_switch + { + case hlsl: + __intrinsic_asm ".operator[]"; + default: + return __getElement(this, index); + } + } } -} +}; -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue) +/// @category stage_io +__generic +[require(glsl_hlsl_spirv, domain_hull)] +__magic_type(HLSLOutputPatchType) +__intrinsic_type($(kIROp_HLSLOutputPatchType)) +struct OutputPatch { - __target_switch + __generic + __subscript(TIndex index)->T { - case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)"; - case spirv: - return spirv_asm + [__unsafeForceInlineEarly] + get { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue - }; + __target_switch + { + case hlsl: + __intrinsic_asm ".operator[]"; + default: + return __getElement(this, index); + } + } } -} - -// Max +}; -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value) +${{{{ +static const struct { + IROp op; + char const* name; +} kMutableByteAddressBufferCases[] = { - __target_switch - { - case hlsl: __intrinsic_asm "NvInterlockedMaxUint64($0, $1, $2)"; - } -} + { kIROp_HLSLRWByteAddressBufferType, "RWByteAddressBuffer" }, + { kIROp_HLSLRasterizerOrderedByteAddressBufferType, "RasterizerOrderedByteAddressBuffer" }, +}; +for(auto item : kMutableByteAddressBufferCases) { +}}}} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value) +/// @category buffer_types +__magic_type(HLSL$(item.name)Type) +__intrinsic_type($(item.op)) +struct $(item.name) { - __target_switch + // Note(tfoley): supports all operations from 
`ByteAddressBuffer` + // TODO(tfoley): can this be made a sub-type? + + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer_rw)] + void GetDimensions(out uint dim) { - case glsl: __intrinsic_asm "atomicMax($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicUMax &ioValue Device None $value - }; + case cpp: __intrinsic_asm ".GetDimensions"; + case cuda: __intrinsic_asm ".GetDimensions"; + case hlsl: __intrinsic_asm ".GetDimensions"; + case glsl: + case spirv: + dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer(this)).x*4; + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_float2) -[ForceInline] -[require(glsl_spirv, atomic_glsl_float2)] -float __atomicMax(__ref float ioValue, float value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer_rw)] + uint Load(int location) { - case glsl: __intrinsic_asm "atomicMax($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpExtension "SPV_EXT_shader_atomic_float_min_max"; - OpCapability AtomicFloat32MinMaxEXT; - result:$$float = OpAtomicFMaxEXT &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load"; + default: + return __byteAddressBufferLoad(this, location, 0); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_float2) -[ForceInline] -[require(glsl_spirv, atomic_glsl_float2)] -half __atomicMax(__ref half ioValue, half value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(hlsl, byteaddressbuffer_rw)] + uint Load(int location, out uint status) { - case glsl: __intrinsic_asm "atomicMax($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpExtension "SPV_EXT_shader_atomic_float_min_max"; - OpCapability AtomicFloat16MinMaxEXT; - result:$$half = OpAtomicFMaxEXT &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load"; + } } -} - -// Min -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint2 Load2(int location) { - case hlsl: __intrinsic_asm "NvInterlockedMinUint64($0, $1, $2)"; + __target_switch + { + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad(this, location, 0); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint2 Load2(int location, int alignment) { - case glsl: __intrinsic_asm "atomicMin($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicUMin &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad(this, location, alignment); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_float2) -[ForceInline] -[require(glsl_spirv, atomic_glsl_float2)] -float __atomicMin(__ref float ioValue, float value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint2 Load2Aligned(int location) { - case glsl: __intrinsic_asm "atomicMin($0, $1)"; - case spirv: - return 
spirv_asm + __target_switch { - OpExtension "SPV_EXT_shader_atomic_float_min_max"; - OpCapability AtomicFloat32MinMaxEXT; - result:$$float = OpAtomicFMinEXT &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load2"; + default: + return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_float2) -[ForceInline] -[require(glsl_spirv, atomic_glsl_float2)] -half __atomicMin(__ref half ioValue, half value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(hlsl, byteaddressbuffer_rw)] + uint2 Load2(int location, out uint status) { - case glsl: __intrinsic_asm "atomicMin($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpExtension "SPV_EXT_shader_atomic_float_min_max"; - OpCapability AtomicFloat16MinMaxEXT; - result:$$half = OpAtomicFMinEXT &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load2"; + } } -} - -// And -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint3 Load3(int location) { - case hlsl: __intrinsic_asm "NvInterlockedAndUint64($0, $1, $2)"; + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad(this, location, 0); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint3 Load3(int location, int alignment) { - case glsl: __intrinsic_asm "atomicAnd($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicAnd &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad(this, location, alignment); + } } -} - -// Or -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint3 Load3Aligned(int location) { - case hlsl: __intrinsic_asm "NvInterlockedOrUint64($0, $1, $2)"; + __target_switch + { + case hlsl: __intrinsic_asm ".Load3"; + default: + return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(hlsl, byteaddressbuffer_rw)] + uint3 Load3(int location, out uint status) { - case glsl: __intrinsic_asm "atomicOr($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicOr &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load3"; + } } -} - -// Xor -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint4 Load4(int location) { - case hlsl: __intrinsic_asm "NvInterlockedXorUint64($0, $1, $2)"; + 
__target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad(this, location, 0); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint4 Load4(int location, int alignment) { - case glsl: __intrinsic_asm "atomicXor($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicXor &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad(this, location, alignment); + } } -} - -// Exchange -[__requiresNVAPI] -[require(hlsl, atomic_hlsl_nvapi)] -uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + uint4 Load4Aligned(int location) { - case hlsl: __intrinsic_asm "NvInterlockedExchangeUint64($0, $1, $2)"; + __target_switch + { + case hlsl: __intrinsic_asm ".Load4"; + default: + return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + } } -} -__glsl_version(430) -__glsl_extension(GL_EXT_shader_atomic_int64) -[ForceInline] -[require(glsl_spirv, atomic_glsl_int64)] -uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value) -{ - __target_switch + [__NoSideEffect] + [ForceInline] + [require(hlsl, byteaddressbuffer_rw)] + uint4 Load4(int location, out uint status) { - case glsl: __intrinsic_asm "atomicExchange($0, $1)"; - case spirv: - return spirv_asm + __target_switch { - OpCapability Int64Atomics; - result:$$uint64_t = OpAtomicExchange &ioValue Device None $value - }; + case hlsl: __intrinsic_asm ".Load4"; + } } -} -// Conversion between uint64_t and uint2 + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + T Load(int location) + { + return __byteAddressBufferLoad(this, location, 0); + } -[require(cpp_cuda_glsl_hlsl_spirv, sm_4_0_version)] -uint2 __asuint2(uint64_t i) -{ - return uint2(uint(i), uint(uint64_t(i) >> 32)); -} - -[require(cpp_cuda_glsl_hlsl_spirv, sm_4_0_version)] -uint64_t __asuint64(uint2 i) -{ - return (uint64_t(i.y) << 32) | i.x; -} - -// - -__intrinsic_op($(kIROp_ByteAddressBufferLoad)) -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer)] -T __byteAddressBufferLoad(ByteAddressBuffer buffer, int offset, int alignment); - -__intrinsic_op($(kIROp_ByteAddressBufferLoad)) -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer_rw)] -T __byteAddressBufferLoad(RWByteAddressBuffer buffer, int offset, int alignment); - -__intrinsic_op($(kIROp_ByteAddressBufferLoad)) -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer_rw)] -T __byteAddressBufferLoad(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment); - -__intrinsic_op($(kIROp_ByteAddressBufferStore)) -[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] -void __byteAddressBufferStore(RWByteAddressBuffer buffer, int offset, int alignment, T value); - -__intrinsic_op($(kIROp_ByteAddressBufferStore)) -[require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] -void __byteAddressBufferStore(RasterizerOrderedByteAddressBuffer buffer, int offset, int alignment, T value); - -/** -Represents an opaque handle to a read-only structured buffer 
allocated in global memory. -A structured buffer can be viewed as an array of the specified element type. -@param T The element type of the buffer. -@param L The memory layout of the buffer. -@remarks -The `L` generic parameter is used to specify the memory layout of the buffer when -generating SPIRV. -`L` must be one of `DefaultDataLayout`, `Std140DataLayout`, `Std430DataLayout` or `ScalarDataLayout`. -The default value is `DefaultDataLayout`. -When generating code for other targets, this parameter is ignored and has no effect on the generated code. -@see `RWStructuredBuffer`, `AppendStructuredBuffer`, `ConsumeStructuredBuffer`, `RasterizerOrderedStructuredBuffer`. -@category buffer_types Buffer types -**/ -__generic -__magic_type(HLSLStructuredBufferType) -__intrinsic_type($(kIROp_HLSLStructuredBufferType)) -struct StructuredBuffer -{ - [__readNone] + [__NoSideEffect] [ForceInline] - void GetDimensions( - out uint numStructs, - out uint stride) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + T Load(int location, int alignment) { - let rs = __structuredBufferGetDimensions(this); - numStructs = rs.x; - stride = rs.y; + return __byteAddressBufferLoad(this, location, alignment); } - __intrinsic_op($(kIROp_StructuredBufferLoad)) - [__readNone] - [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer)] - T Load(TIndex location); - - __intrinsic_op($(kIROp_StructuredBufferLoadStatus)) - [require(hlsl, structuredbuffer)] - T Load(TIndex location, out uint status); + [__NoSideEffect] + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + T LoadAligned(int location) + { + return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + } - __generic - __subscript(TIndex index) -> T +${{{{ + struct BufferAtomicOps { - [__readNone] - __intrinsic_op($(kIROp_StructuredBufferLoad)) - [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer)] - get; + const char* name; + const char* internalName; }; -}; - -/** -Represents an opaque handle to a consume structured buffer allocated in global memory. -A structured buffer can be viewed as an array of the specified element type. -An append structure buffer internally maintains an atomic counter to keep track of the number of elements in the buffer, -and provide an atomic operation to append a new element to the buffer. -@param T The element type of the buffer. -@param L The memory layout of the buffer. -@remarks -This type is supported natively when targeting HLSL. -When generating code for other targets, this type is translated into a pair or an ordinary `StructuredBuffer` and -a separate `RWStructuredBuffer` that holds the atomic counter. -The `L` generic parameter is used to specify the memory layout of the buffer when -generating SPIRV. -`L` must be one of `DefaultDataLayout`, `Std140DataLayout`, `Std430DataLayout` or `ScalarDataLayout`. -The default value is `DefaultDataLayout`. -When generating code for other targets, this parameter is ignored and has no effect on the generated code. -@see `StructuredBuffer`, `AppendStructuredBuffer`, `RWStructuredBuffer`, `RasterizerOrderedStructuredBuffer`. 
-@category buffer_types -*/ -__generic -__magic_type(HLSLConsumeStructuredBufferType) -__intrinsic_type($(kIROp_HLSLConsumeStructuredBufferType)) -[require(cpp_cuda_glsl_hlsl_spirv, consumestructuredbuffer)] -struct ConsumeStructuredBuffer -{ - __intrinsic_op($(kIROp_StructuredBufferConsume)) - T Consume(); - - [ForceInline] - void GetDimensions( - out uint numStructs, - out uint stride) + const BufferAtomicOps bufferAtomicOps[] = { + {"Max", "max"}, + {"Min", "min"}, + {"Add", "add"}, + {"And", "and"}, + {"Or", "or"}, + {"Xor", "xor"}, + {"Exchange", "exchange"} + }; + if (item.op == kIROp_HLSLRWByteAddressBufferType) { - let result = __structuredBufferGetDimensions(this); - numStructs = result.x; - stride = result.y; - } -}; +}}}} -__intrinsic_op($(kIROp_GetElement)) -T __getElement(U collection, I index); + // float32 and int64 atomic support. This is a Slang specific extension, it uses + // GL_EXT_shader_atomic_float on Vulkan + // NvAPI support on DX + // NOTE! To use this feature on HLSL based targets the path to 'nvHLSLExtns.h' from the NvAPI SDK must + // be set. That this include will be added to the *output* that is passed to a downstram compiler. + // Also note that you *can* include NVAPI headers in your Slang source, and directly use NVAPI functions + // Directly using NVAPI functions does *not* add the #include on the output + // Finally note you can *mix* NVAPI direct calls, and use of NVAPI intrinsics below. This doesn't cause + // any clashes, as Slang will emit any NVAPI function it parsed (say via a include in Slang source) with + // unique functions. + // + // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float + // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html -/// @category stage_io Stage IO types -__generic -[require(glsl_hlsl_spirv, hull)] -__magic_type(HLSLInputPatchType) -__intrinsic_type($(kIROp_HLSLInputPatchType)) -struct InputPatch -{ - __generic - __subscript(TIndex index)->T - { - [__unsafeForceInlineEarly] - get - { - __target_switch - { - case hlsl: - __intrinsic_asm ".operator[]"; - default: - return __getElement(this, index); - } - } - } -}; + // F32 Add -/// @category stage_io -__generic -[require(glsl_hlsl_spirv, domain_hull)] -__magic_type(HLSLOutputPatchType) -__intrinsic_type($(kIROp_HLSLOutputPatchType)) -struct OutputPatch -{ - __generic - __subscript(TIndex index)->T + /// Perform a 32-bit floating point atomic add operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic add operation. + /// @param valueToAdd The value to add to the value at `byteAddress`. + /// @param originalValue The original value at `byteAddress` before the add operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicFAdd`. For HLSL, this function translates to an NVAPI call + /// due to lack of native HLSL intrinsic for floating point atomic add. For CUDA, this function + /// maps to `atomicAdd`. 
+ __cuda_sm_version(2.0) + [__requiresNVAPI] + [ForceInline] + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] + void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) { - [__unsafeForceInlineEarly] - get + __target_switch { - __target_switch + case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt($1), $2))"; + default: { - case hlsl: - __intrinsic_asm ".operator[]"; - default: - return __getElement(this, index); + let buf = __getEquivalentStructuredBuffer(this); + originalValue = __atomic_add(buf[byteAddress / 4], valueToAdd); + return; } } } -}; -${{{{ -static const struct { - IROp op; - char const* name; -} kMutableByteAddressBufferCases[] = -{ - { kIROp_HLSLRWByteAddressBufferType, "RWByteAddressBuffer" }, - { kIROp_HLSLRasterizerOrderedByteAddressBufferType, "RasterizerOrderedByteAddressBuffer" }, -}; -for(auto item : kMutableByteAddressBufferCases) { -}}}} - -/// @category buffer_types -__magic_type(HLSL$(item.name)Type) -__intrinsic_type($(item.op)) -struct $(item.name) -{ - // Note(tfoley): supports all operations from `ByteAddressBuffer` - // TODO(tfoley): can this be made a sub-type? + // FP16x2 + /// @internal + /// Maps to the `NvInterlockedAddFp16x2` NVAPI function. + /// + [__requiresNVAPI] [ForceInline] - [require(cpp_cuda_glsl_hlsl_spirv, structuredbuffer_rw)] - void GetDimensions(out uint dim) + [require(cuda_hlsl_spirv)] + uint _NvInterlockedAddFp16x2(uint byteAddress, uint fp16x2Value) { __target_switch { - case cpp: __intrinsic_asm ".GetDimensions"; - case cuda: __intrinsic_asm ".GetDimensions"; - case hlsl: __intrinsic_asm ".GetDimensions"; - case glsl: - case spirv: - dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer(this)).x*4; + case hlsl: + __intrinsic_asm "NvInterlockedAddFp16x2($0, $1, $2)"; + default: + let buf = __getEquivalentStructuredBuffer(this); + return bit_cast(__atomic_add(buf[byteAddress / 4], bit_cast(fp16x2Value))); } } - [__NoSideEffect] + + /// Perform a 16-bit floating point atomic add operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic add operation. + /// @param valueToAdd The value to add to the value at `byteAddress`. + /// @param originalValue The original value at `byteAddress` before the add operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicFAdd` and requires `SPV_EXT_shader_atomic_float16_add` extension. + /// + /// For HLSL, this function translates to an NVAPI call + /// due to lack of native HLSL intrinsic for floating point atomic add. For CUDA, this function + /// maps to `atomicAdd`. 
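// (Editorial illustration, not part of the patch.) A minimal sketch of calling the
// fp16 atomic add declared below, assuming a hypothetical buffer `gBuf` bound as an
// RWByteAddressBuffer:
//
//     RWByteAddressBuffer gBuf;
//
//     void accumulateHalf(uint byteAddress, half amount)
//     {
//         half before;
//         // Works for both the low and high half of a 4-byte word; on HLSL the
//         // operand is packed into an NVAPI fp16x2 atomic under the hood.
//         gBuf.InterlockedAddF16(byteAddress, amount, before);
//     }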
+ [__requiresNVAPI] [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, byteaddressbuffer_rw)] - uint Load(int location) + void InterlockedAddF16(uint byteAddress, half value, out half originalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load"; + case hlsl: + if ((byteAddress & 2) == 0) + { + uint packedInput = asuint16(value); + originalValue = asfloat16((uint16_t)_NvInterlockedAddFp16x2(byteAddress, packedInput)); + } + else + { + byteAddress = byteAddress & ~3; + uint packedInput = ((uint)asuint16(value)) << 16; + originalValue = asfloat16((uint16_t)(_NvInterlockedAddFp16x2(byteAddress, packedInput) >> 16)); + } + return; default: - return __byteAddressBufferLoad(this, location, 0); + { + let buf = __getEquivalentStructuredBuffer(this); + originalValue = __atomic_add(buf[byteAddress/2], value); + return; + } } } - [__NoSideEffect] + /// Perform a 16-bit floating point atomic add operation at `byteAddress` through emulation using `half2` atomics. + /// @param byteAddress The address at which to perform the atomic add operation. + /// @param valueToAdd The value to add to the value at `byteAddress`. + /// @param originalValue The original value at `byteAddress` before the add operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicFAdd` on a `half2` vector with the correct part set to `value` + /// and the remaining part set to 0. This requires the `AtomicFloat16VectorNV` capability introduced by the `SPV_NV_shader_atomic_fp16_vector` + /// extension. + /// + /// For HLSL, this function translates to an equivalent NVAPI call + /// due to lack of native HLSL intrinsic for floating point atomic add. For CUDA, this function + /// maps to `atomicAdd`. + [__requiresNVAPI] [ForceInline] - [require(hlsl, byteaddressbuffer_rw)] - uint Load(int location, out uint status) + void InterlockedAddF16Emulated(uint byteAddress, half value, out half originalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load"; + case hlsl: + if ((byteAddress & 2) == 0) + { + uint packedInput = asuint16(value); + originalValue = asfloat16((uint16_t)_NvInterlockedAddFp16x2(byteAddress, packedInput)); + } + else + { + byteAddress = byteAddress & ~3; + uint packedInput = ((uint)asuint16(value)) << 16; + originalValue = asfloat16((uint16_t)(_NvInterlockedAddFp16x2(byteAddress, packedInput) >> 16)); + } + return; + default: + { + let buf = __getEquivalentStructuredBuffer(this); + if ((byteAddress & 2) == 0) + { + originalValue = __atomic_add(buf[byteAddress/4], half2(value, half(0.0))).x; + } + else + { + originalValue = __atomic_add(buf[byteAddress/4], half2(half(0.0), value)).y; + } + return; + } } } - [__NoSideEffect] + // Without returning original value + + [__requiresNVAPI] [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint2 Load2(int location) + __cuda_sm_version(2.0) + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] + void InterlockedAddF32(uint byteAddress, float valueToAdd) { __target_switch { - case hlsl: __intrinsic_asm ".Load2"; + case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))"; default: - return __byteAddressBufferLoad(this, location, 0); + { + let buf = __getEquivalentStructuredBuffer(this); + __atomic_add(buf[byteAddress / 4], valueToAdd); + return; + } } } - [__NoSideEffect] + // Int64 Add + + /// Perform a 64-bit integer atomic add operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic add operation. 
+ /// @param valueToAdd The value to add to the value at `byteAddress`. + /// @param originalValue The original value at `byteAddress` before the add operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicAdd`. For HLSL, this function + /// translates to `InterlockedAdd64` and requires shader model 6.6. + /// For CUDA, this function maps to `atomicAdd`. [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint2 Load2(int location, int alignment) + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue) + { + InterlockedAdd64(byteAddress, valueToAdd, originalValue); + } + + // Without returning original value + [ForceInline] + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) + { + InterlockedAdd64(byteAddress, valueToAdd); + } + + // Cas uint64_t + + /// Perform a 64-bit integer atomic compare-and-exchange operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic compare-and-exchange operation. + /// @param compareValue The value to compare to the value at `byteAddress`. + /// @param value The value to store at `byteAddress` if the comparison is successful. + /// @param originalValue The original value at `byteAddress` before the add operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function + /// translates to `InterlockedCompareExchange64` and requires shader model 6.6. + /// For CUDA, this function maps to `atomicCAS`. + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load2"; + case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt($1), $2, $3))"; + case hlsl: + __intrinsic_asm ".InterlockedCompareExchange64"; default: - return __byteAddressBufferLoad(this, location, alignment); + let buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value); } } - [__NoSideEffect] + // SM6.6 6 64bit atomics. + + // InterlockedMax64, InterlockedMin64, InterlockedAdd64, InterlockedAnd64, InterlockedOr64, InterlockedXor64, InterlockedExchange64 +${{{{ + for (auto op : bufferAtomicOps) { +}}}} [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint2 Load2Aligned(int location) + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + uint64_t Interlocked$(op.name)U64(uint byteAddress, uint64_t value) + { + uint64_t originalValue; + Interlocked$(op.name)64(byteAddress, value, originalValue); + return originalValue; + } + + [ForceInline] + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void Interlocked$(op.name)64(uint byteAddress, int64_t value) + { + int64_t oldValue; + Interlocked$(op.name)64(byteAddress, value, oldValue); + } + + /// Perform a 64-bit integer atomic $(op.internalName) operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic $(op.internalName) operation. + /// @param value The operand for the $(op.internalName) operation. + /// @param originalValue The original value at `byteAddress` before the $(op.internalName) operation. 
+ [ForceInline] + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void Interlocked$(op.name)64(uint byteAddress, T value, out T outOriginalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load2"; + case hlsl: __intrinsic_asm ".Interlocked$(op.name)64"; default: - return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + let buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __atomic_$(op.internalName)(buf[byteAddress / 8], value); + return; } } +${{{{ +} // for (each bufferOps) +}}}} - [__NoSideEffect] + /// Perform a 64-bit integer atomic compare-and-exchange operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic compare-and-exchange operation. + /// @param compareValue The value to compare to the value at `byteAddress`. + /// @param value The value to store at `byteAddress` if the comparison is successful. + /// @param outOriginalValue The original value at `byteAddress` before the add operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function + /// translates to `InterlockedCompareExchange64` and requires shader model 6.6. + /// For CUDA, this function maps to `atomicCAS`. [ForceInline] - [require(hlsl, byteaddressbuffer_rw)] - uint2 Load2(int location, out uint status) + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedCompareExchange64(uint byteAddress, T compareValue, T value, out T outOriginalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load2"; + case hlsl: + __intrinsic_asm ".InterlockedCompareExchange64"; + default: + let buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value); + return; } } - [__NoSideEffect] [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint3 Load3(int location) + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedCompareExchangeFloatBitwise(uint byteAddress, float compareValue, float value, out float outOriginalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load3"; + case hlsl: __intrinsic_asm ".InterlockedCompareExchangeFloatBitwise"; default: - return __byteAddressBufferLoad(this, location, 0); + let buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value); + return; } } - [__NoSideEffect] + /// Perform a floating-point atomic bitwise exchange operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic exchange operation. + /// @param value The value to store at `byteAddress`. + /// @param [out] outOriginalValue The original value at `byteAddress` before the exchange operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicExchange`. For HLSL, this function + /// translates to `InterlockedExchangeFloat` and requires shader model 6.6. + /// For CUDA, this function maps to `atomicExch`. 
[ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint3 Load3(int location, int alignment) + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedExchangeFloat(uint byteAddress, float value, out float outOriginalValue) { __target_switch { - case hlsl: __intrinsic_asm ".Load3"; + case hlsl: __intrinsic_asm ".InterlockedExchangeFloat"; default: - return __byteAddressBufferLoad(this, location, alignment); + let buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __atomic_exchange(buf[byteAddress / 4], value); + return; } } - [__NoSideEffect] + /// Perform a 64-bit integer atomic compare-and-store operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic store operation. + /// @param compareValue The value to compare to the value at `byteAddress`. + /// @param value The value to store at `byteAddress` if the the value at address is equal to `compareValue`. + /// @param [out] outOriginalValue The original value at `byteAddress` before the store operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function + /// translates to `InterlockedCompareStore64` and requires shader model 6.6. + /// For CUDA, this function maps to `atomicCAS`. [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint3 Load3Aligned(int location) + [ForceInline] + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedCompareStore64(uint byteAddress, T compareValue, T value) { __target_switch { - case hlsl: __intrinsic_asm ".Load3"; + case hlsl: __intrinsic_asm ".InterlockedCompareStore64"; default: - return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + let buf = __getEquivalentStructuredBuffer(this); + __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value); + return; } } - - [__NoSideEffect] + + /// Perform a floating-point atomic bitwise compare-and-store operation at `byteAddress`. + /// @param byteAddress The address at which to perform the atomic compare-and-exchange operation. + /// @param compareValue The value to perform bitwise comparison to the value at `byteAddress`. + /// @param value The value to store at `byteAddress` if the comparison is successful. + /// @param [out] outOriginalValue The original value at `byteAddress` before the compare-and-exchange operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function + /// translates to `InterlockedCompareStoreFloatBitwise` and requires shader model 6.6. + /// For CUDA, this function maps to `atomicCAS`. 
[ForceInline] - [require(hlsl, byteaddressbuffer_rw)] - uint3 Load3(int location, out uint status) + [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] + void InterlockedCompareStoreFloatBitwise(uint byteAddress, float compareValue, float value) { __target_switch { - case hlsl: __intrinsic_asm ".Load3"; + case hlsl: __intrinsic_asm ".InterlockedCompareStoreFloatBitwise"; + default: + let buf = __getEquivalentStructuredBuffer(this); + __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value); + return; } } - [__NoSideEffect] +${{{{ + } // endif (type == RWByteAddressBuffer) +}}}} + + // 32-bit atomic operations: + // InterlockedMax, InterlockedMin, InterlockedAdd, InterlockedAnd, InterlockedOr, InterlockedXor, InterlockedExchange +${{{{ + for (auto op : bufferAtomicOps) { +}}}} + + /// Perform an atomic $(op.internalName) operation at the specified byte + /// location of the byte address buffer. + /// @param dest The byte address at which to perform the atomic $(op.internalName) operation. + /// @param value The operand of the atomic operation. + /// @param original_value The original value at `dest` before the $(op.internalName) operation. [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint4 Load4(int location) + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)] + void Interlocked$(op.name)( + UINT dest, + UINT value, + out UINT original_value) { __target_switch { - case hlsl: __intrinsic_asm ".Load4"; + case hlsl: __intrinsic_asm ".Interlocked$(op.name)"; default: - return __byteAddressBufferLoad(this, location, 0); + let buf = __getEquivalentStructuredBuffer(this); + ::Interlocked$(op.name)(buf[dest / 4], value, original_value); } } - [__NoSideEffect] [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint4 Load4(int location, int alignment) + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)] + void Interlocked$(op.name)( + UINT dest, + UINT value) { __target_switch { - case hlsl: __intrinsic_asm ".Load4"; + case hlsl: __intrinsic_asm ".Interlocked$(op.name)"; default: - return __byteAddressBufferLoad(this, location, alignment); + let buf = __getEquivalentStructuredBuffer(this); + ::Interlocked$(op.name)(buf[dest / 4], value); } } +${{{{ +} // for (buffer atomic ops) +}}}} - [__NoSideEffect] + /// Perform a 32-bit integer atomic compare-and-exchange operation at + /// the specified byte address within the `RWByteAddressBuffer`. + /// @param dest The address at which to perform the atomic compare-and-exchange operation. + /// @param compare_value The value to perform bitwise comparison to the value at `byteAddress`. + /// @param value The value to store at `byteAddress` if the comparison is successful. + /// @param original_value The original value at `byteAddress` before the compare-and-exchange operation. + /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function + /// translates to `InterlockedCompareExchange`. + /// For CUDA, this function maps to `atomicCAS`. 
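// (Editorial illustration, not part of the patch.) The 32-bit compare-and-exchange
// documented above is the usual building block for retry loops; a hedged sketch of
// an atomic float max built on it, using hypothetical names `gBuf` and `atomicMaxF32`:
//
//     RWByteAddressBuffer gBuf;
//
//     void atomicMaxF32(uint byteAddress, float candidate)
//     {
//         uint observed = gBuf.Load(byteAddress);
//         for (;;)
//         {
//             if (asfloat(observed) >= candidate)
//                 return; // stored value is already at least as large
//             uint original;
//             gBuf.InterlockedCompareExchange(byteAddress, observed, asuint(candidate), original);
//             if (original == observed)
//                 return; // our value was stored
//             observed = original; // lost the race; retry against the new value
//         }
//     }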
[ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - uint4 Load4Aligned(int location) + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)] + void InterlockedCompareExchange( + UINT dest, + UINT compare_value, + UINT value, + out UINT original_value) { __target_switch { - case hlsl: __intrinsic_asm ".Load4"; + case hlsl: __intrinsic_asm ".InterlockedCompareExchange"; default: - return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + let buf = __getEquivalentStructuredBuffer(this); + ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value); } } - [__NoSideEffect] + /// Perform a 32-bit integer atomic compare-and-store operation at + /// the specified byte address within the `RWByteAddressBuffer`. + /// @param dest The address at which to perform the atomic add operation. + /// @param compare_value The value to perform comparison to the value at `byteAddress`. + /// @param value The value to store at `byteAddress` if the comparison is successful. + /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function + /// translates to `InterlockedCompareStore`. + /// For CUDA, this function maps to `atomicCAS`. [ForceInline] - [require(hlsl, byteaddressbuffer_rw)] - uint4 Load4(int location, out uint status) + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)] + void InterlockedCompareStore( + UINT dest, + UINT compare_value, + UINT value) { __target_switch { - case hlsl: __intrinsic_asm ".Load4"; + case hlsl: __intrinsic_asm ".InterlockedCompareStore"; + default: + let buf = __getEquivalentStructuredBuffer(this); + ::InterlockedCompareStore(buf[dest / 4], compare_value, value); } } - [__NoSideEffect] - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - T Load(int location) - { - return __byteAddressBufferLoad(this, location, 0); - } - [__NoSideEffect] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - T Load(int location, int alignment) + void Store(uint address, uint value) { - return __byteAddressBufferLoad(this, location, alignment); + __target_switch + { + case hlsl: __intrinsic_asm ".Store"; + default: + __byteAddressBufferStore(this, address, 0, value); + } } - [__NoSideEffect] + [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - T LoadAligned(int location) + void Store2(uint address, uint2 value) { - return __byteAddressBufferLoad(this, location, __naturalStrideOf()); + __target_switch + { + case hlsl: __intrinsic_asm ".Store2"; + default: + __byteAddressBufferStore(this, address, 0, value); + } } -${{{{ - if (item.op == kIROp_HLSLRWByteAddressBufferType) - { -}}}} - - // float32 and int64 atomic support. This is a Slang specific extension, it uses - // GL_EXT_shader_atomic_float on Vulkan - // NvAPI support on DX - // NOTE! To use this feature on HLSL based targets the path to 'nvHLSLExtns.h' from the NvAPI SDK must - // be set. That this include will be added to the *output* that is passed to a downstram compiler. - // Also note that you *can* include NVAPI headers in your Slang source, and directly use NVAPI functions - // Directly using NVAPI functions does *not* add the #include on the output - // Finally note you can *mix* NVAPI direct calls, and use of NVAPI intrinsics below. 
This doesn't cause - // any clashes, as Slang will emit any NVAPI function it parsed (say via a include in Slang source) with - // unique functions. - // - // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float - // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html - - // F32 Add - __cuda_sm_version(2.0) - [__requiresNVAPI] [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] - void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + void Store2(uint address, uint2 value, uint alignment) { __target_switch { - case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt($1), $2))"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_add(__getMetalAtomicRef(buf[byteAddress / 4]), valueToAdd, originalValue); - return; - } - case glsl: - case spirv: - { - let buf = __getEquivalentStructuredBuffer(this); - originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd); - return; - } + case hlsl: __intrinsic_asm ".Store2"; + default: + __byteAddressBufferStore(this, address, alignment, value); } } - // FP16x2 - [__requiresNVAPI] [ForceInline] - [require(hlsl, atomic_hlsl_nvapi)] - uint _NvInterlockedAddFp16x2(uint byteAddress, uint fp16x2Value) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + void Store2Aligned(uint address, uint2 value) { __target_switch { - case hlsl: - __intrinsic_asm "NvInterlockedAddFp16x2($0, $1, $2)"; + case hlsl: __intrinsic_asm ".Store2"; + default: + __byteAddressBufferStore(this, address, __naturalStrideOf(), value); } } - [__requiresNVAPI] [ForceInline] - void InterlockedAddF16(uint byteAddress, half value, out half originalValue) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + void Store3(uint address, uint3 value) { __target_switch { - case hlsl: - if ((byteAddress & 2) == 0) - { - uint packedInput = asuint16(value); - originalValue = asfloat16((uint16_t)_NvInterlockedAddFp16x2(byteAddress, packedInput)); - } - else - { - byteAddress = byteAddress & ~3; - uint packedInput = ((uint)asuint16(value)) << 16; - originalValue = asfloat16((uint16_t)(_NvInterlockedAddFp16x2(byteAddress, packedInput) >> 16)); - } - return; - case glsl: - case spirv: - { - let buf = __getEquivalentStructuredBuffer(this); - if ((byteAddress & 2) == 0) - { - originalValue = __atomicAdd(buf[byteAddress/4], half2(value, half(0.0))).x; - } - else - { - originalValue = __atomicAdd(buf[byteAddress/4], half2(half(0.0), value)).y; - } - return; - } + case hlsl: __intrinsic_asm ".Store3"; + default: + __byteAddressBufferStore(this, address, 0, value); } } - // Without returning original value - [__requiresNVAPI] [ForceInline] - __cuda_sm_version(2.0) - [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] - void InterlockedAddF32(uint byteAddress, float valueToAdd) + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + void Store3(uint address, uint3 value, uint alignment) { __target_switch { - case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))"; - case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt($1), $2)"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - 
__metalInterlocked_add(__getMetalAtomicRef(buf[byteAddress / 4]), valueToAdd); - return; - } - case glsl: - case spirv: - { - let buf = __getEquivalentStructuredBuffer(this); - __atomicAdd(buf[byteAddress / 4], valueToAdd); - return; - } + case hlsl: __intrinsic_asm ".Store3"; + default: + __byteAddressBufferStore(this, address, alignment, value); } } - // Int64 Add [ForceInline] - __cuda_sm_version(6.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda6_int64)] - void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue) + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store3Aligned(uint address, uint3 value) { __target_switch { - case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt($1), $2))"; - case hlsl: - originalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd))); - case glsl: - case spirv: - { - let buf = __getEquivalentStructuredBuffer(this); - originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); - } + case hlsl: __intrinsic_asm ".Store3"; + default: + __byteAddressBufferStore(this, address, __naturalStrideOf(), value); } } - // Without returning original value - __cuda_sm_version(6.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda6_int64)] - void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) + [ForceInline] + [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + void Store4(uint address, uint4 value) { __target_switch { - case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt($1), $2)"; - case hlsl: - __atomicAdd(this, byteAddress, __asuint2(valueToAdd)); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - __atomicAdd(buf[byteAddress / 8], valueToAdd); + case hlsl: __intrinsic_asm ".Store4"; + default: + __byteAddressBufferStore(this, address, 0, value); } } - // Cas uint64_t - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda9_int64)] - void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + void Store4(uint address, uint4 value, uint alignment) { __target_switch { - case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt($1), $2, $3))"; - case hlsl: - outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); + case hlsl: __intrinsic_asm ".Store4"; + default: + __byteAddressBufferStore(this, address, alignment, value); } } - // Max - - __cuda_sm_version(5.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)] - uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) + [ForceInline] + [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] + void Store4Aligned(uint address, uint4 value) { __target_switch { - case cuda: __intrinsic_asm "atomicMax($0._getPtrAt($1), $2)"; - case hlsl: - return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - return __atomicMax(buf[byteAddress / 8], value); + case hlsl: __intrinsic_asm ".Store4"; + default: + __byteAddressBufferStore(this, address, __naturalStrideOf(), value); } } [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMax64(uint byteAddress, int64_t value) + void Store(int offset, T value) { - 
__target_switch - { - case hlsl: __intrinsic_asm ".InterlockedMax64"; - } + __byteAddressBufferStore(this, offset, 0, value); } [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMax64(uint byteAddress, int64_t value, out int64_t outOriginalValue) + void Store(int offset, T value, uint alignment) { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedMax64"; - } + __byteAddressBufferStore(this, offset, alignment, value); } [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMax64(uint byteAddress, uint64_t value) + void StoreAligned(int offset, T value) { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedMax64"; - } + __byteAddressBufferStore(this, offset, __naturalStrideOf(), value); } +}; + +${{{{ +} +}}}} + +${{{{ +static const struct { + IROp op; + char const* name; +} kMutableStructuredBufferCases[] = +{ + { kIROp_HLSLRWStructuredBufferType, "RWStructuredBuffer" }, + { kIROp_HLSLRasterizerOrderedStructuredBufferType, "RasterizerOrderedStructuredBuffer" }, +}; +for(auto item : kMutableStructuredBufferCases) { +}}}} + +__generic +__magic_type(HLSL$(item.name)Type) +__intrinsic_type($(item.op)) +[require(cpp_cuda_glsl_hlsl_metal_spirv, structuredbuffer_rw)] +/** +Represents an opaque handle to a mutable structured buffer allocated in global memory. +A structured buffer can be viewed as an array of the specified element type. + @param T The element type of the buffer. + @param L The memory layout of the buffer. + @remarks +The `L` generic parameter is used to specify the memory layout of the buffer when +generating SPIRV. +`L` must be one of `DefaultDataLayout`, `Std140DataLayout`, `Std430DataLayout` or `ScalarDataLayout`. +The default value is `DefaultDataLayout`. +When generating code for other targets, this parameter is ignored and has no effect on the generated code. + @see `StructuredBuffer`, `AppendStructuredBuffer`, `ConsumeStructuredBuffer` + @category buffer_types +**/ +struct $(item.name) +{ + uint DecrementCounter(); + [__readNone] [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMax64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue) + [require(cpp_cuda_glsl_hlsl_metal_spirv, structuredbuffer_rw)] + void GetDimensions( + out uint numStructs, + out uint stride) { __target_switch { - case hlsl: __intrinsic_asm ".InterlockedMax64"; + case hlsl: __intrinsic_asm ".GetDimensions"; + default: + let rs = __structuredBufferGetDimensions(this); + numStructs = rs.x; + stride = rs.y; } } - // Min + uint IncrementCounter(); + + [__NoSideEffect] + __intrinsic_op($(kIROp_RWStructuredBufferLoad)) + T Load(TIndex location); + + [__NoSideEffect] + __intrinsic_op($(kIROp_RWStructuredBufferLoadStatus)) + T Load(TIndex location, out uint status); - __cuda_sm_version(5.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)] - uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) + __generic + __subscript(TIndex index) -> T { - __target_switch - { - case cuda: __intrinsic_asm "atomicMin($0._getPtrAt($1), $2)"; - case hlsl: - return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - return __atomicMin(buf[byteAddress / 8], value); - } + // If a 'Buffer[index]' is referred to by a '__ref', call 'kIROp_RWStructuredBufferGetElementPtr(index)'. + // + // This allows call's to stay aware that the input is from a 'Buffer'. 
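+        //
+        // Illustrative usage (editor's sketch, not part of this change; `gCounts` and `i`
+        // are placeholder names): because the element access can be taken by reference,
+        // a call such as
+        //
+        //     RWStructuredBuffer<uint> gCounts;
+        //     InterlockedAdd(gCounts[i], 1u);
+        //
+        // resolves the subscript through `kIROp_RWStructuredBufferGetElementPtr`, so the
+        // atomic operates on the buffer element in place rather than on a copy.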
+ [__NoSideEffect] + [nonmutating] + __intrinsic_op($(kIROp_RWStructuredBufferGetElementPtr)) + ref; } +}; - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMin64(uint byteAddress, int64_t value) +${{{{ +} +}}}} + +/// @category stage_io +__generic +[require(glsl_hlsl_spirv, geometry)] +__magic_type(HLSLPointStreamType) +__intrinsic_type($(kIROp_HLSLPointStreamType)) +struct PointStream +{ + [KnownBuiltin("GeometryStreamAppend")] + void Append(T value) { __target_switch { - case hlsl: __intrinsic_asm ".InterlockedMin64"; + case glsl: __intrinsic_asm "EmitVertex()"; + case hlsl: __intrinsic_asm ".Append"; + case spirv: spirv_asm { OpEmitVertex; }; } } - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMin64(uint byteAddress, int64_t value, out int64_t outOriginalValue) + [KnownBuiltin("GeometryStreamRestart")] + void RestartStrip() { __target_switch { - case hlsl: __intrinsic_asm ".InterlockedMin64"; + case glsl: __intrinsic_asm "EndPrimitive()"; + case hlsl: __intrinsic_asm ".RestartStrip"; + case spirv: spirv_asm { OpEndPrimitive; }; } } +}; - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMin64(uint byteAddress, uint64_t value) +/// @category stage_io +__generic +[require(glsl_hlsl_spirv, geometry)] +__magic_type(HLSLLineStreamType) +__intrinsic_type($(kIROp_HLSLLineStreamType)) +struct LineStream +{ + [KnownBuiltin("GeometryStreamAppend")] + void Append(T value) { __target_switch { - case hlsl: __intrinsic_asm ".InterlockedMin64"; + case glsl: __intrinsic_asm "EmitVertex()"; + case hlsl: __intrinsic_asm ".Append"; + case spirv: spirv_asm { OpEmitVertex; }; } } - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedMin64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue) + [KnownBuiltin("GeometryStreamRestart")] + void RestartStrip() { __target_switch { - case hlsl: __intrinsic_asm ".InterlockedMin64"; + case glsl: __intrinsic_asm "EndPrimitive()"; + case hlsl: __intrinsic_asm ".RestartStrip"; + case spirv: spirv_asm { OpEndPrimitive; }; } } +}; - // And - - __cuda_sm_version(5.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)] - uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) +/// @category stage_io +__generic +[require(glsl_hlsl_spirv, geometry)] +__magic_type(HLSLTriangleStreamType) +__intrinsic_type($(kIROp_HLSLTriangleStreamType)) +struct TriangleStream +{ + [KnownBuiltin("GeometryStreamAppend")] + void Append(T value) { __target_switch { - case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt($1), $2)"; - case hlsl: - return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - return __atomicAnd(buf[byteAddress / 8], value); + case glsl: __intrinsic_asm "EmitVertex()"; + case hlsl: __intrinsic_asm ".Append"; + case spirv: spirv_asm { OpEmitVertex; }; } } - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedAnd64(uint byteAddress, uint64_t value) + [KnownBuiltin("GeometryStreamRestart")] + void RestartStrip() { __target_switch { - case hlsl: __intrinsic_asm ".InterlockedAnd64"; - } - } - - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedAnd64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedAnd64"; + case glsl: __intrinsic_asm "EndPrimitive()"; + case hlsl: __intrinsic_asm ".RestartStrip"; + case spirv: spirv_asm { OpEndPrimitive; }; } } +}; - // 
Or +#define VECTOR_MAP_UNARY(TYPE, COUNT, FUNC, VALUE) \ + vector result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(VALUE[i]); } return result - __cuda_sm_version(5.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)] - uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) - { - __target_switch - { - case cuda: __intrinsic_asm "atomicOr($0._getPtrAt($1), $2)"; - case hlsl: - return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - return __atomicOr(buf[byteAddress / 8], value); - } - } +#define MATRIX_MAP_UNARY(TYPE, ROWS, COLS, FUNC, VALUE) \ + matrix result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(VALUE[i]); } return result - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedOr64(uint byteAddress, uint64_t value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedOr64"; - } - } +#define VECTOR_MAP_BINARY(TYPE, COUNT, FUNC, LEFT, RIGHT) \ + vector result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedOr64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedOr64"; - } - } +#define MATRIX_MAP_BINARY(TYPE, ROWS, COLS, FUNC, LEFT, RIGHT) \ + matrix result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result - // Xor +#define VECTOR_MAP_TRINARY(TYPE, COUNT, FUNC, A, B, C) \ + vector result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result - __cuda_sm_version(5.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)] - uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) - { - __target_switch - { - case cuda: __intrinsic_asm "atomicXor($0._getPtrAt($1), $2)"; - case hlsl: - return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - return __atomicXor(buf[byteAddress / 8], value); - } - } +#define MATRIX_MAP_TRINARY(TYPE, ROWS, COLS, FUNC, A, B, C) \ + matrix result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedXor64(uint byteAddress, uint64_t value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedXor64"; - } - } +//@public: - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedXor64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue) +/// Try to terminate the current draw or dispatch call (HLSL SM 4.0) +void abort(); + +/// Absolute value (HLSL SM 1.0) +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T abs(T x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedXor64"; - } + case hlsl: __intrinsic_asm "abs"; + case glsl: __intrinsic_asm "abs"; + case metal: __intrinsic_asm "abs"; + case cuda: __intrinsic_asm "$P_abs($0)"; + case cpp: __intrinsic_asm "$P_abs($0)"; + case spirv: return spirv_asm { + result:$$T = OpExtInst glsl450 SAbs $x + }; + case wgsl: __intrinsic_asm "abs"; + //default: + // Note: this simple definition may not be appropriate for floating-point inputs + // return x < 0 ? 
-x : x; } +} - // Exchange - - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda9_int64)] - uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector abs(vector x) +{ + __target_switch { - __target_switch - { - case cuda: __intrinsic_asm "atomicExch($0._getPtrAt($1), $2)"; - case hlsl: - return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - return __atomicExchange(buf[byteAddress / 8], value); - } + case hlsl: __intrinsic_asm "abs"; + case glsl: __intrinsic_asm "abs"; + case metal: __intrinsic_asm "abs"; + case spirv: return spirv_asm { + result:$$vector = OpExtInst glsl450 SAbs $x; + }; + case wgsl: __intrinsic_asm "abs"; + default: + VECTOR_MAP_UNARY(T, N, abs, x); } +} - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedExchangeFloat(uint byteAddress, float value, out float outOriginalValue) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix abs(matrix x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedExchangeFloat"; - } + case hlsl: __intrinsic_asm "abs"; + default: + MATRIX_MAP_UNARY(T, N, M, abs, x); } +} - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedExchange64(uint byteAddress, int64_t value, out int64_t outOriginalValue) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T abs(T x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedExchange64"; - } + case hlsl: __intrinsic_asm "abs"; + case metal: __intrinsic_asm "abs"; + case glsl: __intrinsic_asm "abs"; + case cuda: __intrinsic_asm "$P_abs($0)"; + case cpp: __intrinsic_asm "$P_abs($0)"; + case spirv: return spirv_asm { + result:$$T = OpExtInst glsl450 FAbs $x; + }; + case wgsl: __intrinsic_asm "abs"; } +} - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedExchange64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector abs(vector x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedExchange64"; - } + case hlsl: __intrinsic_asm "abs"; + case metal: __intrinsic_asm "abs"; + case glsl: __intrinsic_asm "abs"; + case spirv: return spirv_asm { + result:$$vector = OpExtInst glsl450 FAbs $x; + }; + case wgsl: __intrinsic_asm "abs"; + default: + VECTOR_MAP_UNARY(T, N, abs, x); } +} - // SM6.6 6 64bit atomics. 
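    // Illustrative usage (editor's sketch, not part of the diff; `buf`, `byteOffset`, and
    // `previous` are placeholder names): the 64-bit byte-address atomics below are invoked as
    //
    //     RWByteAddressBuffer buf;
    //     int64_t previous;
    //     buf.InterlockedAdd64(byteOffset, int64_t(1), previous);
    //
    // On the GLSL/SPIR-V paths the byte offset is divided by 8 to index an equivalent
    // structured buffer of 64-bit elements, so `byteOffset` should be 8-byte aligned.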
- [ForceInline] - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedAdd64(uint byteAddress, int64_t valueToAdd) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix abs(matrix x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedAdd64"; - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - __atomicAdd(buf[byteAddress / 8], valueToAdd); - } + case hlsl: __intrinsic_asm "abs"; + default: + MATRIX_MAP_UNARY(T, N, M, abs, x); } +} - [ForceInline] - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue) +/// Absolute value (HLSL SM 1.0) +/// @category math +__generic +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T fabs(T x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedAdd64"; - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - outOriginalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); - return; - } + case metal: __intrinsic_asm "fabs"; + default: + return abs(x); } +} - [ForceInline] - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd) +__generic +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector fabs(vector x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedAdd64"; - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - __atomicAdd(buf[byteAddress / 8], valueToAdd); - } + case metal: __intrinsic_asm "fabs"; + default: + return abs(x); } +} - [ForceInline] - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t outOriginalValue) + +/// Inverse cosine (HLSL SM 1.0) +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T acos(T x) +{ + __target_switch { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedAdd64"; - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - outOriginalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); - return; - } + case cpp: __intrinsic_asm "$P_acos($0)"; + case cuda: __intrinsic_asm "$P_acos($0)"; + case glsl: __intrinsic_asm "acos"; + case hlsl: __intrinsic_asm "acos"; + case metal: __intrinsic_asm "acos"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Acos $x + }; + case wgsl: __intrinsic_asm "acos"; } +} - [ForceInline] - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector acos(vector x) +{ + __target_switch { - __target_switch - { - case hlsl: - __cas(this, byteAddress, compareValue, value, outOriginalValue); - return; - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); - return; - } - } - - [ForceInline] - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)] - void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) - { - __target_switch - { - case hlsl: - __cas(this, 
byteAddress, compareValue, value, outOriginalValue); - return; - case glsl: - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); - return; - } - } - - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedCompareStoreFloatBitwise(uint byteAddress, float compareValue, float value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedCompareStoreFloatBitwise"; - } - } - - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedCompareExchangeFloatBitwise(uint byteAddress, float compareValue, float value, out float outOriginalValue) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedCompareExchangeFloatBitwise"; - } - } - - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedCompareStore64(uint byteAddress, int64_t compareValue, int64_t value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedCompareStore64"; - } - } - - [ForceInline] - [require(hlsl, atomic_hlsl_sm_6_6)] - void InterlockedCompareStore64(uint byteAddress, uint64_t compareValue, uint64_t value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".InterlockedCompareStore64"; - } - } - -${{{{ - } // endif (type == RWByteAddressBuffer) -}}}} - - // Added operations: - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedAdd( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedAdd"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_add(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedAdd(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedAdd( - UINT dest, - UINT value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt($1), $2)"; - case hlsl: __intrinsic_asm ".InterlockedAdd"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_add(__getMetalAtomicRef(buf[dest / 4]), value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedAdd(buf[dest / 4], value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedAnd( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedAnd"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_and(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedAnd(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedAnd( - UINT dest, - UINT value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt($1), $2)"; - case 
hlsl: __intrinsic_asm ".InterlockedAnd"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_and(__getMetalAtomicRef(buf[dest / 4]), value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedAnd(buf[dest / 4], value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedCompareExchange( - UINT dest, - UINT compare_value, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))"; - case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt($1), $2, $3))"; - case hlsl: __intrinsic_asm ".InterlockedCompareExchange"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_compare_exchange(__getMetalAtomicRef(buf[dest / 4]), compare_value, value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedCompareStore( - UINT dest, - UINT compare_value, - UINT value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicCompSwap($0._data[$1/4], $2, $3)"; - case cuda: __intrinsic_asm "atomicCAS($0._getPtrAt($1), $2, $3)"; - case hlsl: __intrinsic_asm ".InterlockedCompareStore"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_compare_exchange(__getMetalAtomicRef(buf[dest / 4]), compare_value, value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedCompareStore(buf[dest / 4], compare_value, value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedExchange( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedExchange"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_exchange(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedExchange(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedMax( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedMax"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_max(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedMax(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedMax( - UINT dest, - UINT value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "atomicMax($0._getPtrAt($1), $2)"; - case hlsl: __intrinsic_asm ".InterlockedMax"; - case metal: - { - let buf = 
__getEquivalentStructuredBuffer(this); - __metalInterlocked_max(__getMetalAtomicRef(buf[dest / 4]), value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedMax(buf[dest / 4], value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedMin( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedMin"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_min(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedMin(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedMin( - UINT dest, - UINT value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "atomicMin($0._getPtrAt($1), $2)"; - case hlsl: __intrinsic_asm ".InterlockedMin"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_min(__getMetalAtomicRef(buf[dest / 4]), value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedMin(buf[dest / 4], value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedOr( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedOr"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_or(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedOr(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedOr( - UINT dest, - UINT value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "atomicOr($0._getPtrAt($1), $2)"; - case hlsl: __intrinsic_asm ".InterlockedOr"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_or(__getMetalAtomicRef(buf[dest / 4]), value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedOr(buf[dest / 4], value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedXor( - UINT dest, - UINT value, - out UINT original_value) - { - __target_switch - { - case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))"; - case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt($1), $2))"; - case hlsl: __intrinsic_asm ".InterlockedXor"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_xor(__getMetalAtomicRef(buf[dest / 4]), value, original_value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedXor(buf[dest / 4], value, original_value); - } - } - - [ForceInline] - [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void InterlockedXor( - UINT dest, - UINT 
value) - { - __target_switch - { - case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)"; - case cuda: __intrinsic_asm "atomicXor($0._getPtrAt($1), $2)"; - case hlsl: __intrinsic_asm ".InterlockedXor"; - case metal: - { - let buf = __getEquivalentStructuredBuffer(this); - __metalInterlocked_xor(__getMetalAtomicRef(buf[dest / 4]), value); - return; - } - case spirv: - let buf = __getEquivalentStructuredBuffer(this); - ::InterlockedXor(buf[dest / 4], value); - } - } - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store(uint address, uint value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store"; - default: - __byteAddressBufferStore(this, address, 0, value); - } - } - - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store2(uint address, uint2 value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store2"; - default: - __byteAddressBufferStore(this, address, 0, value); - } - } - - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store2(uint address, uint2 value, uint alignment) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store2"; - default: - __byteAddressBufferStore(this, address, alignment, value); - } - } - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store2Aligned(uint address, uint2 value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store2"; - default: - __byteAddressBufferStore(this, address, __naturalStrideOf(), value); - } - } - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store3(uint address, uint3 value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store3"; - default: - __byteAddressBufferStore(this, address, 0, value); - } - } - - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store3(uint address, uint3 value, uint alignment) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store3"; - default: - __byteAddressBufferStore(this, address, alignment, value); - } - } - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] - void Store3Aligned(uint address, uint3 value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store3"; - default: - __byteAddressBufferStore(this, address, __naturalStrideOf(), value); - } - } - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] - void Store4(uint address, uint4 value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store4"; - default: - __byteAddressBufferStore(this, address, 0, value); - } - } - - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store4(uint address, uint4 value, uint alignment) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store4"; - default: - __byteAddressBufferStore(this, address, alignment, value); - } - } - - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] - void Store4Aligned(uint address, uint4 value) - { - __target_switch - { - case hlsl: __intrinsic_asm ".Store4"; - default: - __byteAddressBufferStore(this, address, __naturalStrideOf(), value); - } - } - - [ForceInline] - void Store(int offset, T value) - { - __byteAddressBufferStore(this, offset, 0, value); - } - - [ForceInline] - void Store(int offset, T value, uint alignment) - { - __byteAddressBufferStore(this, offset, alignment, value); - } - - [ForceInline] - void StoreAligned(int offset, T value) - { 
- __byteAddressBufferStore(this, offset, __naturalStrideOf(), value); - } -}; - -${{{{ -} -}}}} - -${{{{ -static const struct { - IROp op; - char const* name; -} kMutableStructuredBufferCases[] = -{ - { kIROp_HLSLRWStructuredBufferType, "RWStructuredBuffer" }, - { kIROp_HLSLRasterizerOrderedStructuredBufferType, "RasterizerOrderedStructuredBuffer" }, -}; -for(auto item : kMutableStructuredBufferCases) { -}}}} - -__generic -__magic_type(HLSL$(item.name)Type) -__intrinsic_type($(item.op)) -[require(cpp_cuda_glsl_hlsl_metal_spirv, structuredbuffer_rw)] -/** -Represents an opaque handle to a mutable structured buffer allocated in global memory. -A structured buffer can be viewed as an array of the specified element type. - @param T The element type of the buffer. - @param L The memory layout of the buffer. - @remarks -The `L` generic parameter is used to specify the memory layout of the buffer when -generating SPIRV. -`L` must be one of `DefaultDataLayout`, `Std140DataLayout`, `Std430DataLayout` or `ScalarDataLayout`. -The default value is `DefaultDataLayout`. -When generating code for other targets, this parameter is ignored and has no effect on the generated code. - @see `StructuredBuffer`, `AppendStructuredBuffer`, `ConsumeStructuredBuffer` - @category buffer_types -**/ -struct $(item.name) -{ - uint DecrementCounter(); - - [__readNone] - [ForceInline] - [require(cpp_cuda_glsl_hlsl_metal_spirv, structuredbuffer_rw)] - void GetDimensions( - out uint numStructs, - out uint stride) - { - __target_switch - { - case hlsl: __intrinsic_asm ".GetDimensions"; - default: - let rs = __structuredBufferGetDimensions(this); - numStructs = rs.x; - stride = rs.y; - } - } - - uint IncrementCounter(); - - [__NoSideEffect] - __intrinsic_op($(kIROp_RWStructuredBufferLoad)) - T Load(TIndex location); - - [__NoSideEffect] - __intrinsic_op($(kIROp_RWStructuredBufferLoadStatus)) - T Load(TIndex location, out uint status); - - __generic - __subscript(TIndex index) -> T - { - // If a 'Buffer[index]' is referred to by a '__ref', call 'kIROp_RWStructuredBufferGetElementPtr(index)'. - // - // This allows call's to stay aware that the input is from a 'Buffer'. 
- [__NoSideEffect] - [nonmutating] - __intrinsic_op($(kIROp_RWStructuredBufferGetElementPtr)) - ref; - } -}; - -${{{{ -} -}}}} - -/// @category stage_io -__generic -[require(glsl_hlsl_spirv, geometry)] -__magic_type(HLSLPointStreamType) -__intrinsic_type($(kIROp_HLSLPointStreamType)) -struct PointStream -{ - [KnownBuiltin("GeometryStreamAppend")] - void Append(T value) - { - __target_switch - { - case glsl: __intrinsic_asm "EmitVertex()"; - case hlsl: __intrinsic_asm ".Append"; - case spirv: spirv_asm { OpEmitVertex; }; - } - } - - [KnownBuiltin("GeometryStreamRestart")] - void RestartStrip() - { - __target_switch - { - case glsl: __intrinsic_asm "EndPrimitive()"; - case hlsl: __intrinsic_asm ".RestartStrip"; - case spirv: spirv_asm { OpEndPrimitive; }; - } - } -}; - -/// @category stage_io -__generic -[require(glsl_hlsl_spirv, geometry)] -__magic_type(HLSLLineStreamType) -__intrinsic_type($(kIROp_HLSLLineStreamType)) -struct LineStream -{ - [KnownBuiltin("GeometryStreamAppend")] - void Append(T value) - { - __target_switch - { - case glsl: __intrinsic_asm "EmitVertex()"; - case hlsl: __intrinsic_asm ".Append"; - case spirv: spirv_asm { OpEmitVertex; }; - } - } - - [KnownBuiltin("GeometryStreamRestart")] - void RestartStrip() - { - __target_switch - { - case glsl: __intrinsic_asm "EndPrimitive()"; - case hlsl: __intrinsic_asm ".RestartStrip"; - case spirv: spirv_asm { OpEndPrimitive; }; - } - } -}; - -/// @category stage_io -__generic -[require(glsl_hlsl_spirv, geometry)] -__magic_type(HLSLTriangleStreamType) -__intrinsic_type($(kIROp_HLSLTriangleStreamType)) -struct TriangleStream -{ - [KnownBuiltin("GeometryStreamAppend")] - void Append(T value) - { - __target_switch - { - case glsl: __intrinsic_asm "EmitVertex()"; - case hlsl: __intrinsic_asm ".Append"; - case spirv: spirv_asm { OpEmitVertex; }; - } - } - - [KnownBuiltin("GeometryStreamRestart")] - void RestartStrip() - { - __target_switch - { - case glsl: __intrinsic_asm "EndPrimitive()"; - case hlsl: __intrinsic_asm ".RestartStrip"; - case spirv: spirv_asm { OpEndPrimitive; }; - } - } -}; - -#define VECTOR_MAP_UNARY(TYPE, COUNT, FUNC, VALUE) \ - vector result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(VALUE[i]); } return result - -#define MATRIX_MAP_UNARY(TYPE, ROWS, COLS, FUNC, VALUE) \ - matrix result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(VALUE[i]); } return result - -#define VECTOR_MAP_BINARY(TYPE, COUNT, FUNC, LEFT, RIGHT) \ - vector result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result - -#define MATRIX_MAP_BINARY(TYPE, ROWS, COLS, FUNC, LEFT, RIGHT) \ - matrix result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result - -#define VECTOR_MAP_TRINARY(TYPE, COUNT, FUNC, A, B, C) \ - vector result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result - -#define MATRIX_MAP_TRINARY(TYPE, ROWS, COLS, FUNC, A, B, C) \ - matrix result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result - -//@public: - -/// Try to terminate the current draw or dispatch call (HLSL SM 4.0) -void abort(); - -/// Absolute value (HLSL SM 1.0) -/// @category math -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T abs(T x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "abs"; - case glsl: __intrinsic_asm "abs"; - case metal: __intrinsic_asm "abs"; - case cuda: __intrinsic_asm "$P_abs($0)"; - case cpp: __intrinsic_asm "$P_abs($0)"; - case spirv: return 
spirv_asm { - result:$$T = OpExtInst glsl450 SAbs $x - }; - case wgsl: __intrinsic_asm "abs"; - //default: - // Note: this simple definition may not be appropriate for floating-point inputs - // return x < 0 ? -x : x; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector abs(vector x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "abs"; - case glsl: __intrinsic_asm "abs"; - case metal: __intrinsic_asm "abs"; - case spirv: return spirv_asm { - result:$$vector = OpExtInst glsl450 SAbs $x; - }; - case wgsl: __intrinsic_asm "abs"; - default: - VECTOR_MAP_UNARY(T, N, abs, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix abs(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "abs"; - default: - MATRIX_MAP_UNARY(T, N, M, abs, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T abs(T x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "abs"; - case metal: __intrinsic_asm "abs"; - case glsl: __intrinsic_asm "abs"; - case cuda: __intrinsic_asm "$P_abs($0)"; - case cpp: __intrinsic_asm "$P_abs($0)"; - case spirv: return spirv_asm { - result:$$T = OpExtInst glsl450 FAbs $x; - }; - case wgsl: __intrinsic_asm "abs"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector abs(vector x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "abs"; - case metal: __intrinsic_asm "abs"; - case glsl: __intrinsic_asm "abs"; - case spirv: return spirv_asm { - result:$$vector = OpExtInst glsl450 FAbs $x; - }; - case wgsl: __intrinsic_asm "abs"; - default: - VECTOR_MAP_UNARY(T, N, abs, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix abs(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "abs"; - default: - MATRIX_MAP_UNARY(T, N, M, abs, x); - } -} - -/// Absolute value (HLSL SM 1.0) -/// @category math -__generic -[__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T fabs(T x) -{ - __target_switch - { - case metal: __intrinsic_asm "fabs"; - default: - return abs(x); - } -} - -__generic -[__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector fabs(vector x) -{ - __target_switch - { - case metal: __intrinsic_asm "fabs"; - default: - return abs(x); - } -} - - -/// Inverse cosine (HLSL SM 1.0) -/// @category math -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T acos(T x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_acos($0)"; - case cuda: __intrinsic_asm "$P_acos($0)"; - case glsl: __intrinsic_asm "acos"; - case hlsl: __intrinsic_asm "acos"; - case metal: __intrinsic_asm "acos"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Acos $x - }; - case wgsl: __intrinsic_asm "acos"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector acos(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "acos"; - case hlsl: __intrinsic_asm "acos"; - case metal: __intrinsic_asm "acos"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Acos $x - }; - case wgsl: __intrinsic_asm "acos"; - default: - VECTOR_MAP_UNARY(T, N, acos, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix acos(matrix x) -{ - __target_switch - { - case hlsl: 
__intrinsic_asm "acos"; - default: - MATRIX_MAP_UNARY(T, N, M, acos, x); - } -} - -/// Inverse hyperbolic cosine -/// @category math -__generic -[__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T acosh(T x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_acosh($0)"; - case cuda: __intrinsic_asm "$P_acosh($0)"; - case glsl: __intrinsic_asm "acosh"; - case metal: __intrinsic_asm "acosh"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Acosh $x - }; - case wgsl: __intrinsic_asm "acosh"; - default: - return log(x + sqrt( x * x - T(1))); - } -} - -__generic -[__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector acosh(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "acosh"; - case metal: __intrinsic_asm "acosh"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Acosh $x - }; - case wgsl: __intrinsic_asm "acosh"; - default: - VECTOR_MAP_UNARY(T, N, acosh, x); - } -} - - -// Test if all components are non-zero (HLSL SM 1.0) -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -bool all(T x) -{ - __target_switch - { - default: - __intrinsic_asm "bool($0)"; - case hlsl: - __intrinsic_asm "all"; - case metal: - __intrinsic_asm "all"; - case wgsl: - __intrinsic_asm "all"; - case spirv: - let zero = __default(); - if (__isInt()) - return spirv_asm - { - OpINotEqual $$bool result $x $zero - }; - else if (__isFloat()) - return spirv_asm - { - OpFUnordNotEqual $$bool result $x $zero - }; - else if (__isBool()) - return __slang_noop_cast(x); - else - return false; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -bool all(vector x) -{ - if(N == 1) - return all(x[0]); - __target_switch - { - case hlsl: - __intrinsic_asm "all"; - case metal: - __intrinsic_asm "all"; - case glsl: - __intrinsic_asm "all(bvec$N0($0))"; - case spirv: - if (__isBool()) - return spirv_asm - { - OpAll $$bool result $x - }; - else if (__isInt()) - { - let zero = __default>(); - return spirv_asm - { - OpINotEqual $$vector %castResult $x $zero; - OpAll $$bool result %castResult - }; - } - else - { - let zero = __default>(); - return spirv_asm - { - OpFUnordNotEqual $$vector %castResult $x $zero; - OpAll $$bool result %castResult - }; - } - case wgsl: - __intrinsic_asm "all"; - default: - bool result = true; - for(int i = 0; i < N; ++i) - result = result && all(x[i]); - return result; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv)] -bool all(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "all"; - default: - bool result = true; - for(int i = 0; i < N; ++i) - result = result && all(x[i]); - return result; - } -} - -/// Barrier for writes to all memory spaces (HLSL SM 5.0) -/// @category barrier Memory and control barriers -__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] -void AllMemoryBarrier() -{ - __target_switch - { - case hlsl: __intrinsic_asm "AllMemoryBarrier"; - case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; - case cuda: __intrinsic_asm "__threadfence()"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; - case spirv: spirv_asm - { - OpMemoryBarrier Device 
AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory; - }; - case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; - } -} - -/// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0) -/// @category barrier -__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] -void AllMemoryBarrierWithGroupSync() -{ - __target_switch - { - case hlsl: __intrinsic_asm "AllMemoryBarrierWithGroupSync"; - case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; - case cuda: __intrinsic_asm "__syncthreads()"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; - case spirv: spirv_asm - { - OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory; - }; - case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; - } -} - -// Returns the workgroup size of the calling entry point. -[require(compute)] -__intrinsic_op($(kIROp_GetWorkGroupSize)) -int3 WorkgroupSize(); - -// Test if any components is non-zero (HLSL SM 1.0) - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -bool any(T x) -{ - __target_switch - { - default: - __intrinsic_asm "bool($0)"; - case hlsl: - __intrinsic_asm "any"; - case metal: - __intrinsic_asm "any"; - case wgsl: - __intrinsic_asm "any"; - case spirv: - let zero = __default(); - if (__isInt()) - return spirv_asm - { - OpINotEqual $$bool result $x $zero - }; - else if (__isFloat()) - return spirv_asm - { - OpFUnordNotEqual $$bool result $x $zero - }; - else if (__isBool()) - return __slang_noop_cast(x); - return false; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -bool any(vector x) -{ - if(N == 1) - return any(x[0]); - __target_switch - { - case hlsl: - __intrinsic_asm "any"; - case metal: - __intrinsic_asm "any"; - case glsl: - __intrinsic_asm "any(bvec$N0($0))"; - case spirv: - if (__isBool()) - return spirv_asm - { - OpAny $$bool result $x - }; - else if (__isInt()) - { - let zero = __default>(); - return spirv_asm - { - OpINotEqual $$vector %castResult $x $zero; - OpAny $$bool result %castResult - }; - } - else - { - let zero = __default>(); - return spirv_asm - { - OpFUnordNotEqual $$vector %castResult $x $zero; - OpAny $$bool result %castResult - }; - } - case wgsl: - __intrinsic_asm "any"; - default: - bool result = false; - for(int i = 0; i < N; ++i) - result = result || any(x[i]); - return result; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv)] -bool any(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "any"; - default: - bool result = false; - for(int i = 0; i < N; ++i) - result = result || any(x[i]); - return result; - } -} - - -/// Reinterpret bits as a double (HLSL SM 5.0) -/// @category conversion -__glsl_extension(GL_ARB_gpu_shader5) -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -double asdouble(uint lowbits, uint highbits) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asdouble"; - case glsl: __intrinsic_asm "packDouble2x32(uvec2($0, $1))"; - case cpp: __intrinsic_asm "$P_asdouble($0, $1)"; - case cuda: __intrinsic_asm "$P_asdouble($0, $1)"; - case spirv: return spirv_asm { - %v:$$uint2 = OpCompositeConstruct $lowbits $highbits; - 
result:$$double = OpExtInst glsl450 59 %v - }; - } -} - -__glsl_extension(GL_ARB_gpu_shader5) -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -double2 asdouble(uint2 lowbits, uint2 highbits) -{ - __target_switch - { - case hlsl: - __intrinsic_asm "asdouble($0, $1)"; - default: - return double2(asdouble(lowbits.x, highbits.x), asdouble(lowbits.y, highbits.y)); - } -} - -/// Reinterpret bits as a float (HLSL SM 4.0) -/// @category conversion -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -float asfloat(int x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asfloat($0)"; - case cuda: __intrinsic_asm "$P_asfloat($0)"; - case glsl: __intrinsic_asm "intBitsToFloat"; - case hlsl: __intrinsic_asm "asfloat"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$float result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - } -} - -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -float asfloat(uint x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asfloat($0)"; - case cuda: __intrinsic_asm "$P_asfloat($0)"; - case glsl: __intrinsic_asm "uintBitsToFloat"; - case hlsl: __intrinsic_asm "asfloat"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$float result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -vector asfloat(vector< int, N> x) -{ - __target_switch - { - case glsl: __intrinsic_asm "intBitsToFloat"; - case hlsl: __intrinsic_asm "asfloat"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$vector result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - default: - VECTOR_MAP_UNARY(float, N, asfloat, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -vector asfloat(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "uintBitsToFloat"; - case hlsl: __intrinsic_asm "asfloat"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$vector result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - default: - VECTOR_MAP_UNARY(float, N, asfloat, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -matrix asfloat(matrix< int,N,M> x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asfloat"; - default: - MATRIX_MAP_UNARY(float, N, M, asfloat, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -matrix asfloat(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asfloat"; - default: - MATRIX_MAP_UNARY(float, N, M, asfloat, x); - } -} - -[__unsafeForceInlineEarly] -[__readNone] -float asfloat(float x) -{ return x; } - -__generic -[__unsafeForceInlineEarly] -[__readNone] -vector asfloat(vector x) -{ return x; } - -__generic -[__unsafeForceInlineEarly] -[__readNone] -matrix asfloat(matrix x) -{ return x; } - -/// Inverse sine (HLSL SM 1.0) -/// @category math -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T asin(T x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asin($0)"; - case cuda: __intrinsic_asm "$P_asin($0)"; - case glsl: __intrinsic_asm "asin"; - case hlsl: __intrinsic_asm "asin"; - case metal: __intrinsic_asm "asin"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Asin $x 
- }; - case wgsl: __intrinsic_asm "asin"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector asin(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "asin"; - case hlsl: __intrinsic_asm "asin"; - case metal: __intrinsic_asm "asin"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Asin $x - }; - case wgsl: __intrinsic_asm "asin"; - default: - VECTOR_MAP_UNARY(T,N,asin,x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix asin(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asin"; - default: - MATRIX_MAP_UNARY(T,N,M,asin,x); - } -} - -/// Inverse hyperbolic sine. -/// @category math -__generic -[__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T asinh(T x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asinh($0)"; - case cuda: __intrinsic_asm "$P_asinh($0)"; - case glsl: __intrinsic_asm "asinh"; - case metal: __intrinsic_asm "asinh"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Asinh $x - }; - case wgsl: __intrinsic_asm "asinh"; - default: - return log(x + sqrt(x * x + T(1))); - } -} - -__generic -[__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector asinh(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "asinh"; - case metal: __intrinsic_asm "asinh"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Asinh $x - }; - case wgsl: __intrinsic_asm "asinh"; - default: - VECTOR_MAP_UNARY(T, N, asinh, x); - } -} - -/// Reinterpret bits as an int (HLSL SM 4.0) -/// @category conversion -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -int asint(float x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asint($0)"; - case cuda: __intrinsic_asm "$P_asint($0)"; - case glsl: __intrinsic_asm "floatBitsToInt"; - case hlsl: __intrinsic_asm "asint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$int result $x - }; - } -} - -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -int asint(uint x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asint($0)"; - case cuda: __intrinsic_asm "$P_asint($0)"; - case glsl: __intrinsic_asm "int($0)"; - case hlsl: __intrinsic_asm "asint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$int result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -vector asint(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "floatBitsToInt"; - case hlsl: __intrinsic_asm "asint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$vector result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - default: - VECTOR_MAP_UNARY(int, N, asint, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -vector asint(vector x) -{ - if(N == 1) - return vector(asint(x[0])); - __target_switch - { - case glsl: __intrinsic_asm "ivec$N0($0)"; - case hlsl: __intrinsic_asm "asint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$vector result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - default: - 
VECTOR_MAP_UNARY(int, N, asint, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -matrix asint(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asint"; - default: - MATRIX_MAP_UNARY(int, N, M, asint, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -matrix asint(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asint"; - default: - MATRIX_MAP_UNARY(int, N, M, asint, x); - } -} - -// No op -[__unsafeForceInlineEarly] -[__readNone] -int asint(int x) -{ return x; } - -__generic -[__unsafeForceInlineEarly] -[__readNone] -vector asint(vector x) -{ return x; } - -__generic -[__unsafeForceInlineEarly] -[__readNone] -matrix asint(matrix x) -{ return x; } - -/// Reinterpret bits of double as a uint (HLSL SM 5.0) -/// @category conversion -__glsl_extension(GL_ARB_gpu_shader5) -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -void asuint(double value, out uint lowbits, out uint highbits) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asuint"; - case glsl: __intrinsic_asm "{ uvec2 v = unpackDouble2x32($0); $1 = v.x; $2 = v.y; }"; - case cpp: - case cuda: - __intrinsic_asm "$P_asuint($0, $1, $2)"; - case spirv: - let uv = spirv_asm - { - result : $$uint2 = OpBitcast $value; - }; - lowbits = uv.x; - highbits = uv.y; - return; - } -} - -// Reinterpret bits as a uint (HLSL SM 4.0) - -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -uint asuint(float x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asuint($0)"; - case cuda: __intrinsic_asm "$P_asuint($0)"; - case glsl: __intrinsic_asm "floatBitsToUint"; - case hlsl: __intrinsic_asm "asuint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$uint result $x - }; - } -} - -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -uint asuint(int x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_asuint($0)"; - case cuda: __intrinsic_asm "$P_asuint($0)"; - case glsl: __intrinsic_asm "uint($0)"; - case hlsl: __intrinsic_asm "asuint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$uint result $x - }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -vector asuint(vector x) -{ - __target_switch - { - case glsl: __intrinsic_asm "floatBitsToUint"; - case hlsl: __intrinsic_asm "asuint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$vector result $x - }; - default: - VECTOR_MAP_UNARY(uint, N, asuint, x); - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] -vector asuint(vector x) -{ - if(N == 1) - return vector(asuint(x[0])); - __target_switch - { - case glsl: __intrinsic_asm "uvec$N0($0)"; - case hlsl: __intrinsic_asm "asuint"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; + case glsl: __intrinsic_asm "acos"; + case hlsl: __intrinsic_asm "acos"; + case metal: __intrinsic_asm "acos"; case spirv: return spirv_asm { - OpBitcast $$vector result $x + OpExtInst $$vector result glsl450 Acos $x }; - case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; - default: - VECTOR_MAP_UNARY(uint, N, asuint, x); - } -} - -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] 
-matrix asuint(matrix x) -{ - __target_switch - { - case hlsl: __intrinsic_asm "asuint"; + case wgsl: __intrinsic_asm "acos"; default: - MATRIX_MAP_UNARY(uint, N, M, asuint, x); + VECTOR_MAP_UNARY(T, N, acos, x); } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] -matrix asuint(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix acos(matrix x) { __target_switch { - case hlsl: __intrinsic_asm "asuint"; + case hlsl: __intrinsic_asm "acos"; default: - MATRIX_MAP_UNARY(uint, N, M, asuint, x); - } -} - -[__unsafeForceInlineEarly] -[__readNone] -uint asuint(uint x) -{ return x; } - -__generic -[__unsafeForceInlineEarly] -[__readNone] -vector asuint(vector x) -{ return x; } - -__generic -[__unsafeForceInlineEarly] -[__readNone] -matrix asuint(matrix x) -{ return x; } - - -// 16-bit bitcast ops (HLSL SM 6.2) -// -// TODO: We need to map these to GLSL/SPIR-V -// operations that don't require an intermediate -// conversion to fp32. - -// Identity cases: - -/// Reinterpret bits as a float16 (HLSL SM 6.2). -/// @category conversion -[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(float16_t value) { return value; } -[__unsafeForceInlineEarly][__readNone] vector asfloat16(vector value) { return value; } -[__unsafeForceInlineEarly][__readNone] matrix asfloat16(matrix value) { return value; } - -/// Reinterpret bits as a int16_t (HLSL SM 6.2). -/// @category conversion -[__unsafeForceInlineEarly][__readNone] int16_t asint16(int16_t value) { return value; } -[__unsafeForceInlineEarly][__readNone] vector asint16(vector value) { return value; } -[__unsafeForceInlineEarly][__readNone] matrix asint16(matrix value) { return value; } - -/// Reinterpret bits as a uint16_t (HLSL SM 6.2). -/// @category conversion -[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(uint16_t value) { return value; } -[__unsafeForceInlineEarly][__readNone] vector asuint16(vector value) { return value; } -[__unsafeForceInlineEarly][__readNone] matrix asuint16(matrix value) { return value; } - -// Signed<->unsigned cases: - -[__unsafeForceInlineEarly][__readNone] int16_t asint16(uint16_t value) { return value; } -[__unsafeForceInlineEarly][__readNone] vector asint16(vector value) { return value; } -[__unsafeForceInlineEarly][__readNone] matrix asint16(matrix value) { return value; } - -[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(int16_t value) { return value; } -[__unsafeForceInlineEarly][__readNone] vector asuint16(vector value) { return value; } -[__unsafeForceInlineEarly][__readNone] matrix asuint16(matrix value) { return value; } - -// Float->unsigned cases: - -[__readNone] -[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -uint16_t asuint16(float16_t value) -{ - __target_switch - { - case cuda: __intrinsic_asm "__half_as_ushort"; - case glsl: __intrinsic_asm "uint16_t(packHalf2x16(vec2($0, 0.0)))"; - case hlsl: __intrinsic_asm "asuint16"; - case spirv: return spirv_asm { - OpBitcast $$uint16_t result $value - }; + MATRIX_MAP_UNARY(T, N, M, acos, x); } } +/// Inverse hyperbolic cosine +/// @category math +__generic [__readNone] -[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -vector asuint16(vector value) +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T acosh(T x) { __target_switch { - case hlsl: __intrinsic_asm "asuint16"; + case cpp: __intrinsic_asm "$P_acosh($0)"; + case cuda: __intrinsic_asm "$P_acosh($0)"; + case glsl: __intrinsic_asm "acosh"; + case metal: __intrinsic_asm "acosh"; case spirv: 
return spirv_asm { - result:$$vector = OpBitcast $value + OpExtInst $$T result glsl450 Acosh $x }; + case wgsl: __intrinsic_asm "acosh"; default: - VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); - } -} - -[__readNone] -[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -matrix asuint16(matrix value) -{ MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); } - -// Unsigned->float cases: - -[__readNone] -[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -float16_t asfloat16(uint16_t value) -{ - __target_switch - { - case cuda: __intrinsic_asm "__ushort_as_half"; - case glsl: __intrinsic_asm "float16_t(unpackHalf2x16($0).x)"; - case hlsl: __intrinsic_asm "asfloat16"; - case spirv: return spirv_asm { - OpBitcast $$float16_t result $value - }; + return log(x + sqrt( x * x - T(1))); } } +__generic [__readNone] -[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] -vector asfloat16(vector value) +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector acosh(vector x) { __target_switch { - case hlsl: __intrinsic_asm "asfloat16"; + case glsl: __intrinsic_asm "acosh"; + case metal: __intrinsic_asm "acosh"; case spirv: return spirv_asm { - result:$$vector = OpBitcast $value + OpExtInst $$vector result glsl450 Acosh $x }; + case wgsl: __intrinsic_asm "acosh"; default: - VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); + VECTOR_MAP_UNARY(T, N, acosh, x); } } -[__readNone] -matrix asfloat16(matrix value) -{ MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); } - -// Float<->signed cases: -[__unsafeForceInlineEarly] +// Test if all components are non-zero (HLSL SM 1.0) +__generic [__readNone] -[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] -int16_t asint16(float16_t value) -{ - __target_switch - { - case cuda: __intrinsic_asm "__half_as_short"; - case hlsl: __intrinsic_asm "asint16"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$int16_t result $value - }; - default: return asuint16(value); - } -} - -[__unsafeForceInlineEarly] -[__readNone] -[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] -vector asint16(vector value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +bool all(T x) { __target_switch { - case hlsl: __intrinsic_asm "asint16"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - default: return asuint16(value); + default: + __intrinsic_asm "bool($0)"; + case hlsl: + __intrinsic_asm "all"; + case metal: + __intrinsic_asm "all"; + case wgsl: + __intrinsic_asm "all"; + case spirv: + let zero = __default(); + if (__isInt()) + return spirv_asm + { + OpINotEqual $$bool result $x $zero + }; + else if (__isFloat()) + return spirv_asm + { + OpFUnordNotEqual $$bool result $x $zero + }; + else if (__isBool()) + return __slang_noop_cast(x); + else + return false; } } -[__unsafeForceInlineEarly] +__generic [__readNone] -[require(cuda_hlsl_spirv, shader5_sm_5_0)] -matrix asint16(matrix value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +bool all(vector x) { + if(N == 1) + return all(x[0]); __target_switch { - case hlsl: __intrinsic_asm "asint16"; - default: return asuint16(value); + case hlsl: + __intrinsic_asm "all"; + case metal: + __intrinsic_asm "all"; + case glsl: + __intrinsic_asm "all(bvec$N0($0))"; + case spirv: + if (__isBool()) + return spirv_asm + { + OpAll $$bool result $x + }; + else if (__isInt()) + { + let zero = __default>(); + return spirv_asm + { + OpINotEqual $$vector %castResult $x $zero; + OpAll $$bool result %castResult + }; + } + else + { + let zero = __default>(); + return spirv_asm + { + OpFUnordNotEqual 
$$vector %castResult $x $zero; + OpAll $$bool result %castResult + }; + } + case wgsl: + __intrinsic_asm "all"; + default: + bool result = true; + for(int i = 0; i < N; ++i) + result = result && all(x[i]); + return result; } } +__generic [__readNone] -[__unsafeForceInlineEarly] -[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] -float16_t asfloat16(int16_t value) +[require(cpp_cuda_glsl_hlsl_metal_spirv)] +bool all(matrix x) { __target_switch { - case cuda: __intrinsic_asm "__short_as_half"; - case hlsl: __intrinsic_asm "asfloat16"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$float16_t result $value - }; - default: return asfloat16(asuint16(value)); + case hlsl: __intrinsic_asm "all"; + default: + bool result = true; + for(int i = 0; i < N; ++i) + result = result && all(x[i]); + return result; } } -[__unsafeForceInlineEarly] -[__readNone] -[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] -vector asfloat16(vector value) +/// Barrier for writes to all memory spaces (HLSL SM 5.0) +/// @category barrier Memory and control barriers +__glsl_extension(GL_KHR_memory_scope_semantics) +[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] +void AllMemoryBarrier() { __target_switch { - case hlsl: __intrinsic_asm "asfloat16"; - case metal: __intrinsic_asm "as_type<$TR>($0)"; - case spirv: return spirv_asm { - OpBitcast $$vector result $value - }; - default: return asfloat16(asuint16(value)); + case hlsl: __intrinsic_asm "AllMemoryBarrier"; + case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; + case cuda: __intrinsic_asm "__threadfence()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; + case spirv: spirv_asm + { + OpMemoryBarrier Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory; + }; + case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; } } -[__unsafeForceInlineEarly] -[__readNone] -[require(cuda_hlsl_spirv, shader5_sm_5_0)] -matrix asfloat16(matrix value) +/// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0) +/// @category barrier +__glsl_extension(GL_KHR_memory_scope_semantics) +[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] +void AllMemoryBarrierWithGroupSync() { __target_switch { - case hlsl: __intrinsic_asm "asfloat16"; - default: return asfloat16(asuint16(value)); + case hlsl: __intrinsic_asm "AllMemoryBarrierWithGroupSync"; + case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; + case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; + case spirv: spirv_asm + { + OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory; + }; + case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; } } -/// Inverse tangent (HLSL SM 1.0). 
-/// @category math -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T atan(T x) -{ - __target_switch - { - case cpp: __intrinsic_asm "$P_atan($0)"; - case cuda: __intrinsic_asm "$P_atan($0)"; - case glsl: __intrinsic_asm "atan"; - case hlsl: __intrinsic_asm "atan"; - case metal: __intrinsic_asm "atan"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Atan $x - }; - case wgsl: __intrinsic_asm "atan"; - } -} +// Returns the workgroup size of the calling entry point. +[require(compute)] +__intrinsic_op($(kIROp_GetWorkGroupSize)) +int3 WorkgroupSize(); -__generic +// Test if any components is non-zero (HLSL SM 1.0) + +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector atan(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +bool any(T x) { __target_switch { - case glsl: __intrinsic_asm "atan"; - case hlsl: __intrinsic_asm "atan"; - case metal: __intrinsic_asm "atan"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Atan $x - }; - case wgsl: __intrinsic_asm "atan"; default: - VECTOR_MAP_UNARY(T, N, atan, x); + __intrinsic_asm "bool($0)"; + case hlsl: + __intrinsic_asm "any"; + case metal: + __intrinsic_asm "any"; + case wgsl: + __intrinsic_asm "any"; + case spirv: + let zero = __default(); + if (__isInt()) + return spirv_asm + { + OpINotEqual $$bool result $x $zero + }; + else if (__isFloat()) + return spirv_asm + { + OpFUnordNotEqual $$bool result $x $zero + }; + else if (__isBool()) + return __slang_noop_cast(x); + return false; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix atan(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +bool any(vector x) { + if(N == 1) + return any(x[0]); __target_switch { - case hlsl: __intrinsic_asm "atan"; + case hlsl: + __intrinsic_asm "any"; + case metal: + __intrinsic_asm "any"; + case glsl: + __intrinsic_asm "any(bvec$N0($0))"; + case spirv: + if (__isBool()) + return spirv_asm + { + OpAny $$bool result $x + }; + else if (__isInt()) + { + let zero = __default>(); + return spirv_asm + { + OpINotEqual $$vector %castResult $x $zero; + OpAny $$bool result %castResult + }; + } + else + { + let zero = __default>(); + return spirv_asm + { + OpFUnordNotEqual $$vector %castResult $x $zero; + OpAny $$bool result %castResult + }; + } + case wgsl: + __intrinsic_asm "any"; default: - MATRIX_MAP_UNARY(T, N, M, atan, x); + bool result = false; + for(int i = 0; i < N; ++i) + result = result || any(x[i]); + return result; } } -/// Inverse tangent (HLSL SM 1.0). 
-/// @category math -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T atan2(T y, T x) +[require(cpp_cuda_glsl_hlsl_spirv)] +bool any(matrix x) { __target_switch { - case cpp: __intrinsic_asm "$P_atan2($0, $1)"; - case cuda: __intrinsic_asm "$P_atan2($0, $1)"; - case glsl: __intrinsic_asm "atan($0,$1)"; - case hlsl: __intrinsic_asm "atan2"; - case metal: __intrinsic_asm "atan2"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Atan2 $y $x - }; - case wgsl: __intrinsic_asm "atan2"; + case hlsl: __intrinsic_asm "any"; + default: + bool result = false; + for(int i = 0; i < N; ++i) + result = result || any(x[i]); + return result; } } -__generic + +/// Reinterpret bits as a double (HLSL SM 5.0) +/// @category conversion +__glsl_extension(GL_ARB_gpu_shader5) [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector atan2(vector y, vector x) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +double asdouble(uint lowbits, uint highbits) { __target_switch { - case glsl: __intrinsic_asm "atan($0,$1)"; - case hlsl: __intrinsic_asm "atan2"; - case metal: __intrinsic_asm "atan2"; + case hlsl: __intrinsic_asm "asdouble"; + case glsl: __intrinsic_asm "packDouble2x32(uvec2($0, $1))"; + case cpp: __intrinsic_asm "$P_asdouble($0, $1)"; + case cuda: __intrinsic_asm "$P_asdouble($0, $1)"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Atan2 $y $x + %v:$$uint2 = OpCompositeConstruct $lowbits $highbits; + result:$$double = OpExtInst glsl450 59 %v }; - case wgsl: __intrinsic_asm "atan2"; - default: - VECTOR_MAP_BINARY(T, N, atan2, y, x); } } -__generic +__glsl_extension(GL_ARB_gpu_shader5) [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix atan2(matrix y, matrix x) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +double2 asdouble(uint2 lowbits, uint2 highbits) { __target_switch { - case hlsl: __intrinsic_asm "atan2"; + case hlsl: + __intrinsic_asm "asdouble($0, $1)"; default: - MATRIX_MAP_BINARY(T, N, M, atan2, y, x); + return double2(asdouble(lowbits.x, highbits.x), asdouble(lowbits.y, highbits.y)); } } -/// Hyperbolic inverse tangent -/// @category math -__generic +/// Reinterpret bits as a float (HLSL SM 4.0) +/// @category conversion [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T atanh(T x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +float asfloat(int x) { __target_switch { - case cpp: __intrinsic_asm "$P_atanh($0)"; - case cuda: __intrinsic_asm "$P_atanh($0)"; - case glsl: __intrinsic_asm "atanh"; - case metal: __intrinsic_asm "atanh"; + case cpp: __intrinsic_asm "$P_asfloat($0)"; + case cuda: __intrinsic_asm "$P_asfloat($0)"; + case glsl: __intrinsic_asm "intBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Atanh $x + OpBitcast $$float result $x }; - case wgsl: __intrinsic_asm "atanh"; - default: - return T(0.5) * log((T(1) + x) / (T(1) - x)); + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; } } -__generic [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector atanh(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +float asfloat(uint x) { __target_switch { - case glsl: __intrinsic_asm "atanh"; - case metal: __intrinsic_asm "atanh"; + case cpp: __intrinsic_asm "$P_asfloat($0)"; + case cuda: 
__intrinsic_asm "$P_asfloat($0)"; + case glsl: __intrinsic_asm "uintBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Atanh $x + OpBitcast $$float result $x }; - case wgsl: __intrinsic_asm "atanh"; - default: - VECTOR_MAP_UNARY(T, N, atanh, x); + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; } } -/// Ceiling (HLSL SM 1.0). -/// @category math -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T ceil(T x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +vector asfloat(vector< int, N> x) { __target_switch { - case cpp: __intrinsic_asm "$P_ceil($0)"; - case cuda: __intrinsic_asm "$P_ceil($0)"; - case glsl: __intrinsic_asm "ceil"; - case hlsl: __intrinsic_asm "ceil"; - case metal: __intrinsic_asm "ceil"; + case glsl: __intrinsic_asm "intBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Ceil $x + OpBitcast $$vector result $x }; - case wgsl: __intrinsic_asm "ceil"; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; + default: + VECTOR_MAP_UNARY(float, N, asfloat, x); } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector ceil(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +vector asfloat(vector x) { __target_switch { - case glsl: __intrinsic_asm "ceil"; - case hlsl: __intrinsic_asm "ceil"; - case metal: __intrinsic_asm "ceil"; + case glsl: __intrinsic_asm "uintBitsToFloat"; + case hlsl: __intrinsic_asm "asfloat"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Ceil $x + OpBitcast $$vector result $x }; - case wgsl: __intrinsic_asm "ceil"; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; default: - VECTOR_MAP_UNARY(T, N, ceil, x); + VECTOR_MAP_UNARY(float, N, asfloat, x); } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix ceil(matrix x) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +matrix asfloat(matrix< int,N,M> x) { __target_switch { - case hlsl: __intrinsic_asm "ceil"; + case hlsl: __intrinsic_asm "asfloat"; default: - MATRIX_MAP_UNARY(T, N, M, ceil, x); + MATRIX_MAP_UNARY(float, N, M, asfloat, x); } } -// Copy-sign -/// @category math -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv)] -vector copysign_half(vector x, vector y) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +matrix asfloat(matrix x) { - let ux = reinterpret>(x); - let uy = reinterpret>(y); - vector signY = (uy & (uint16_t(1) << uint16_t(15))); - vector newX = (ux & ((uint16_t(1) << uint16_t(15)) - uint16_t(1))) + signY; - return reinterpret>(newX); + __target_switch + { + case hlsl: __intrinsic_asm "asfloat"; + default: + MATRIX_MAP_UNARY(float, N, M, asfloat, x); + } } -/// @category math -__generic +[__unsafeForceInlineEarly] [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv)] -vector copysign_float(vector x, vector y) -{ - let ux = reinterpret>(x); - let uy = reinterpret>(y); - vector signY = (uy & (uint32_t(1) << uint32_t(31))); - vector newX = (ux & ((uint32_t(1) << uint32_t(31)) - uint32_t(1))) + signY; - return reinterpret>(newX); -} +float asfloat(float x) +{ return x; } -/// @category math -__generic +__generic +[__unsafeForceInlineEarly] [__readNone] 
-[require(cpp_cuda_glsl_hlsl_metal_spirv)] -vector copysign_double(vector x, vector y) -{ - let ux = reinterpret>(x); - let uy = reinterpret>(y); - vector signY = (uy & (uint64_t(1) << uint64_t(63))); - vector newX = (ux & ((uint64_t(1) << uint64_t(63)) - uint64_t(1))) + signY; - return reinterpret>(newX); -} - -__generic -__intrinsic_op($(kIROp_FloatCast)) -vector __real_cast(vector val); +vector asfloat(vector x) +{ return x; } -/// @category math -__generic +__generic +[__unsafeForceInlineEarly] [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv)] -vector copysign(vector x, vector y) -{ - __target_switch - { - case metal: __intrinsic_asm "copysign"; - default: - { - // sign of -0.0 needs to be respected. - if (T is half) - return __real_cast(copysign_half( - __real_cast(x), - __real_cast(y))); - if (T is float) - return __real_cast(copysign_float( - __real_cast(x), - __real_cast(y))); - return __real_cast(copysign_double( - __real_cast(x), - __real_cast(y))); - } - } -} +matrix asfloat(matrix x) +{ return x; } +/// Inverse sine (HLSL SM 1.0) +/// @category math __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv)] -T copysign(T x, T y) -{ - __target_switch - { - case metal: __intrinsic_asm "copysign"; - default: - return copysign(vector(x), vector(y))[0]; - } -} - - -// Check access status to tiled resource -[ForceInline] -[require(hlsl, sm_5_0)] -bool CheckAccessFullyMapped(out uint status) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T asin(T x) { __target_switch { - case hlsl: __intrinsic_asm "CheckAccessFullyMapped"; + case cpp: __intrinsic_asm "$P_asin($0)"; + case cuda: __intrinsic_asm "$P_asin($0)"; + case glsl: __intrinsic_asm "asin"; + case hlsl: __intrinsic_asm "asin"; + case metal: __intrinsic_asm "asin"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Asin $x + }; + case wgsl: __intrinsic_asm "asin"; } } -/// Clamp (HLSL SM 1.0). 
-/// @category math -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T clamp(T x, T minBound, T maxBound) +vector asin(vector x) { __target_switch { - case hlsl: __intrinsic_asm "clamp"; - case glsl: __intrinsic_asm "clamp"; - case metal: __intrinsic_asm "clamp"; - case spirv: - if (__isSignedInt()) - return spirv_asm { - result:$$T = OpExtInst glsl450 SClamp $x $minBound $maxBound - }; - else - return spirv_asm { - result:$$T = OpExtInst glsl450 UClamp $x $minBound $maxBound - }; - case wgsl: __intrinsic_asm "clamp"; + case glsl: __intrinsic_asm "asin"; + case hlsl: __intrinsic_asm "asin"; + case metal: __intrinsic_asm "asin"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Asin $x + }; + case wgsl: __intrinsic_asm "asin"; default: - return min(max(x, minBound), maxBound); + VECTOR_MAP_UNARY(T,N,asin,x); } } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector clamp(vector x, vector minBound, vector maxBound) +matrix asin(matrix x) { __target_switch { - case hlsl: __intrinsic_asm "clamp"; - case glsl: __intrinsic_asm "clamp"; - case metal: __intrinsic_asm "clamp"; - case spirv: - if (__isSignedInt()) - return spirv_asm { - result:$$vector = OpExtInst glsl450 SClamp $x $minBound $maxBound - }; - else - return spirv_asm { - result:$$vector = OpExtInst glsl450 UClamp $x $minBound $maxBound - }; - case wgsl: __intrinsic_asm "clamp"; + case hlsl: __intrinsic_asm "asin"; default: - return min(max(x, minBound), maxBound); + MATRIX_MAP_UNARY(T,N,M,asin,x); } } -__generic +/// Inverse hyperbolic sine. +/// @category math +__generic [__readNone] +[ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix clamp(matrix x, matrix minBound, matrix maxBound) +T asinh(T x) { __target_switch { - case hlsl: __intrinsic_asm "clamp"; + case cpp: __intrinsic_asm "$P_asinh($0)"; + case cuda: __intrinsic_asm "$P_asinh($0)"; + case glsl: __intrinsic_asm "asinh"; + case metal: __intrinsic_asm "asinh"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Asinh $x + }; + case wgsl: __intrinsic_asm "asinh"; default: - return min(max(x, minBound), maxBound); + return log(x + sqrt(x * x + T(1))); } } -__generic +__generic [__readNone] +[ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T clamp(T x, T minBound, T maxBound) +vector asinh(vector x) { __target_switch { - case hlsl: __intrinsic_asm "clamp"; - case glsl: __intrinsic_asm "clamp"; - case metal: __intrinsic_asm "clamp"; + case glsl: __intrinsic_asm "asinh"; + case metal: __intrinsic_asm "asinh"; case spirv: return spirv_asm { - result:$$T = OpExtInst glsl450 FClamp $x $minBound $maxBound + OpExtInst $$vector result glsl450 Asinh $x }; - case wgsl: __intrinsic_asm "clamp"; + case wgsl: __intrinsic_asm "asinh"; default: - return min(max(x, minBound), maxBound); + VECTOR_MAP_UNARY(T, N, asinh, x); } } -__generic +/// Reinterpret bits as an int (HLSL SM 4.0) +/// @category conversion [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector clamp(vector x, vector minBound, vector maxBound) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +int asint(float x) { __target_switch { - case hlsl: __intrinsic_asm "clamp"; - case glsl: __intrinsic_asm "clamp"; - case metal: __intrinsic_asm "clamp"; + case cpp: __intrinsic_asm "$P_asint($0)"; + case cuda: __intrinsic_asm "$P_asint($0)"; + case glsl: __intrinsic_asm "floatBitsToInt"; + case hlsl: 
__intrinsic_asm "asint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; case spirv: return spirv_asm { - result:$$vector = OpExtInst glsl450 FClamp $x $minBound $maxBound + OpBitcast $$int result $x }; - case wgsl: __intrinsic_asm "clamp"; - default: - return min(max(x, minBound), maxBound); } } -__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix clamp(matrix x, matrix minBound, matrix maxBound) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +int asint(uint x) { __target_switch { - case hlsl: __intrinsic_asm "clamp"; - default: - return min(max(x, minBound), maxBound); + case cpp: __intrinsic_asm "$P_asint($0)"; + case cuda: __intrinsic_asm "$P_asint($0)"; + case glsl: __intrinsic_asm "int($0)"; + case hlsl: __intrinsic_asm "asint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$int result $x + }; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; } } -/// Clip (discard) fragment conditionally -__generic -[require(cpp_cuda_glsl_hlsl_spirv, fragment)] -void clip(T x) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +vector asint(vector x) { __target_switch { - case hlsl: __intrinsic_asm "clip"; + case glsl: __intrinsic_asm "floatBitsToInt"; + case hlsl: __intrinsic_asm "asint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; default: - if(x < T(0)) discard; + VECTOR_MAP_UNARY(int, N, asint, x); } } -__generic -[require(cpp_cuda_glsl_hlsl_spirv, fragment)] -void clip(vector x) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +vector asint(vector x) { + if(N == 1) + return vector(asint(x[0])); __target_switch { - case hlsl: __intrinsic_asm "clip"; + case glsl: __intrinsic_asm "ivec$N0($0)"; + case hlsl: __intrinsic_asm "asint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; default: - if(any(x < T(0))) discard; + VECTOR_MAP_UNARY(int, N, asint, x); } } -__generic -[require(cpp_cuda_glsl_hlsl_spirv, fragment)] -void clip(matrix x) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] +matrix asint(matrix x) { __target_switch { - case hlsl: __intrinsic_asm "clip"; + case hlsl: __intrinsic_asm "asint"; default: - if(any(x < T(0))) discard; + MATRIX_MAP_UNARY(int, N, M, asint, x); } } -/// @category math -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T cos(T x) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] +matrix asint(matrix x) { __target_switch { - case cpp: __intrinsic_asm "$P_cos($0)"; - case cuda: __intrinsic_asm "$P_cos($0)"; - case glsl: __intrinsic_asm "cos"; - case hlsl: __intrinsic_asm "cos"; - case metal: __intrinsic_asm "cos"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Cos $x - }; - case wgsl: __intrinsic_asm "cos"; + case hlsl: __intrinsic_asm "asint"; + default: + MATRIX_MAP_UNARY(int, N, M, asint, x); } } -__generic +// No op +[__unsafeForceInlineEarly] [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector cos(vector x) +int asint(int x) +{ return x; } + +__generic +[__unsafeForceInlineEarly] +[__readNone] +vector asint(vector x) +{ return x; } + +__generic 
+[__unsafeForceInlineEarly] +[__readNone] +matrix asint(matrix x) +{ return x; } + +/// Reinterpret bits of double as a uint (HLSL SM 5.0) +/// @category conversion +__glsl_extension(GL_ARB_gpu_shader5) +[__readNone] +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] +void asuint(double value, out uint lowbits, out uint highbits) { __target_switch { - case glsl: __intrinsic_asm "cos"; - case hlsl: __intrinsic_asm "cos"; - case metal: __intrinsic_asm "cos"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Cos $x - }; - case wgsl: __intrinsic_asm "cos"; - default: - VECTOR_MAP_UNARY(T,N, cos, x); + case hlsl: __intrinsic_asm "asuint"; + case glsl: __intrinsic_asm "{ uvec2 v = unpackDouble2x32($0); $1 = v.x; $2 = v.y; }"; + case cpp: + case cuda: + __intrinsic_asm "$P_asuint($0, $1, $2)"; + case spirv: + let uv = spirv_asm + { + result : $$uint2 = OpBitcast $value; + }; + lowbits = uv.x; + highbits = uv.y; + return; } } -__generic +// Reinterpret bits as a uint (HLSL SM 4.0) + [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix cos(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +uint asuint(float x) { __target_switch { - case hlsl: __intrinsic_asm "cos"; - default: - MATRIX_MAP_UNARY(T, N, M, cos, x); + case cpp: __intrinsic_asm "$P_asuint($0)"; + case cuda: __intrinsic_asm "$P_asuint($0)"; + case glsl: __intrinsic_asm "floatBitsToUint"; + case hlsl: __intrinsic_asm "asuint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$uint result $x + }; } } -/// Hyperbolic cosine. -/// @category math -__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T cosh(T x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +uint asuint(int x) { __target_switch { - case cpp: __intrinsic_asm "$P_cosh($0)"; - case cuda: __intrinsic_asm "$P_cosh($0)"; - case glsl: __intrinsic_asm "cosh"; - case hlsl: __intrinsic_asm "cosh"; - case metal: __intrinsic_asm "cosh"; + case cpp: __intrinsic_asm "$P_asuint($0)"; + case cuda: __intrinsic_asm "$P_asuint($0)"; + case glsl: __intrinsic_asm "uint($0)"; + case hlsl: __intrinsic_asm "asuint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Cosh $x + OpBitcast $$uint result $x }; - case wgsl: __intrinsic_asm "cosh"; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector cosh(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +vector asuint(vector x) { __target_switch { - case glsl: __intrinsic_asm "cosh"; - case hlsl: __intrinsic_asm "cosh"; - case metal: __intrinsic_asm "cosh"; + case glsl: __intrinsic_asm "floatBitsToUint"; + case hlsl: __intrinsic_asm "asuint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Cosh $x + OpBitcast $$vector result $x }; - case wgsl: __intrinsic_asm "cosh"; default: - VECTOR_MAP_UNARY(T,N, cosh, x); + VECTOR_MAP_UNARY(uint, N, asuint, x); + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix cosh(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_4_0)] +vector asuint(vector x) { + if(N == 1) + return vector(asuint(x[0])); __target_switch { - case 
hlsl: __intrinsic_asm "cosh"; + case glsl: __intrinsic_asm "uvec$N0($0)"; + case hlsl: __intrinsic_asm "asuint"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$vector result $x + }; + case wgsl: __intrinsic_asm "bitcast<$TR>($0)"; default: - MATRIX_MAP_UNARY(T, N, M, cosh, x); + VECTOR_MAP_UNARY(uint, N, asuint, x); } } -/// Compute the cosine of an angle in degrees. -/// @category math -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T cospi(T x) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] +matrix asuint(matrix x) { __target_switch { - case metal: __intrinsic_asm "cospi"; + case hlsl: __intrinsic_asm "asuint"; default: - return cos(T.getPi() * x); + MATRIX_MAP_UNARY(uint, N, M, asuint, x); } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector cospi(vector x) +[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_4_0)] +matrix asuint(matrix x) { __target_switch { - case metal: __intrinsic_asm "cospi"; + case hlsl: __intrinsic_asm "asuint"; default: - return cos(T.getPi() * x); + MATRIX_MAP_UNARY(uint, N, M, asuint, x); } } +[__unsafeForceInlineEarly] +[__readNone] +uint asuint(uint x) +{ return x; } -/// Population count. -/// @category bitops +__generic +[__unsafeForceInlineEarly] [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint countbits(uint value) -{ - __target_switch - { - case hlsl: - __intrinsic_asm "countbits"; - case glsl: - __intrinsic_asm "bitCount"; - case metal: - __intrinsic_asm "popcount"; - case cuda: - case cpp: - __intrinsic_asm "$P_countbits($0)"; - case spirv: - return spirv_asm {OpBitCount $$uint result $value}; - case wgsl: - __intrinsic_asm "countOneBits"; - } -} +vector asuint(vector x) +{ return x; } -__generic +__generic +[__unsafeForceInlineEarly] [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector countbits(vector value) -{ - __target_switch - { - case hlsl: - __intrinsic_asm "countbits"; - case glsl: - __intrinsic_asm "bitCount"; - case metal: - __intrinsic_asm "popcount"; - case spirv: - return spirv_asm {OpBitCount $$vector result $value}; - case wgsl: - __intrinsic_asm "countOneBits"; - default: - VECTOR_MAP_UNARY(uint, N, countbits, value); - } -} +matrix asuint(matrix x) +{ return x; } + + +// 16-bit bitcast ops (HLSL SM 6.2) +// +// TODO: We need to map these to GLSL/SPIR-V +// operations that don't require an intermediate +// conversion to fp32. + +// Identity cases: + +/// Reinterpret bits as a float16 (HLSL SM 6.2). +/// @category conversion +[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(float16_t value) { return value; } +[__unsafeForceInlineEarly][__readNone] vector asfloat16(vector value) { return value; } +[__unsafeForceInlineEarly][__readNone] matrix asfloat16(matrix value) { return value; } + +/// Reinterpret bits as a int16_t (HLSL SM 6.2). +/// @category conversion +[__unsafeForceInlineEarly][__readNone] int16_t asint16(int16_t value) { return value; } +[__unsafeForceInlineEarly][__readNone] vector asint16(vector value) { return value; } +[__unsafeForceInlineEarly][__readNone] matrix asint16(matrix value) { return value; } + +/// Reinterpret bits as a uint16_t (HLSL SM 6.2). 
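+/// The 16-bit overloads reinterpret the underlying bits without numeric conversion; e.g. asuint16(float16_t(1.0)) yields 0x3C00.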
+/// @category conversion +[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(uint16_t value) { return value; } +[__unsafeForceInlineEarly][__readNone] vector asuint16(vector value) { return value; } +[__unsafeForceInlineEarly][__readNone] matrix asuint16(matrix value) { return value; } + +// Signed<->unsigned cases: + +[__unsafeForceInlineEarly][__readNone] int16_t asint16(uint16_t value) { return value; } +[__unsafeForceInlineEarly][__readNone] vector asint16(vector value) { return value; } +[__unsafeForceInlineEarly][__readNone] matrix asint16(matrix value) { return value; } + +[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(int16_t value) { return value; } +[__unsafeForceInlineEarly][__readNone] vector asuint16(vector value) { return value; } +[__unsafeForceInlineEarly][__readNone] matrix asuint16(matrix value) { return value; } + +// Float->unsigned cases: -/// Cross product -/// @category math -__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector cross(vector left, vector right) +[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +uint16_t asuint16(float16_t value) { - // TODO: SPIRV does not support integer vectors. __target_switch { - case glsl: __intrinsic_asm "cross"; - case hlsl: __intrinsic_asm "cross"; - case metal: __intrinsic_asm "cross"; + case cuda: __intrinsic_asm "__half_as_ushort"; + case glsl: __intrinsic_asm "uint16_t(packHalf2x16(vec2($0, 0.0)))"; + case hlsl: __intrinsic_asm "asuint16"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Cross $left $right + OpBitcast $$uint16_t result $value }; - case wgsl: __intrinsic_asm "cross"; - default: - return vector( - left.y * right.z - left.z * right.y, - left.z * right.x - left.x * right.z, - left.x * right.y - left.y * right.x); } } -__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector cross(vector left, vector right) +[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +vector asuint16(vector value) { __target_switch { - case glsl: __intrinsic_asm "cross"; - case hlsl: __intrinsic_asm "cross"; + case hlsl: __intrinsic_asm "asuint16"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Cross $left $right + result:$$vector = OpBitcast $value }; - case wgsl: __intrinsic_asm "cross"; default: - return vector( - left.y * right.z - left.z * right.y, - left.z * right.x - left.x * right.z, - left.x * right.y - left.y * right.x); + VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); } } -// Convert encoded color [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -int4 D3DCOLORtoUBYTE4(float4 color) -{ - __target_switch - { - case hlsl: __intrinsic_asm "D3DCOLORtoUBYTE4"; - case wgsl: __intrinsic_asm "bitcast(pack4x8unorm($0)).zyxw"; - default: - let scaled = color.zyxw * 255.001999f; - return int4(scaled); - } -} +[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +matrix asuint16(matrix value) +{ MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); } -// Partial-difference derivatives -${{{{ -const char* diffDimensions[2] = {"x", "y"}; -for (auto xOrY : diffDimensions) { -}}}} -/// @category derivative Derivative functions -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] -T dd$(xOrY)(T x) -{ - __requireComputeDerivative(); - __target_switch - { - case hlsl: - case cpp: - case cuda: - __intrinsic_asm "dd$(xOrY)"; - case glsl: - __intrinsic_asm "dFd$(xOrY)"; - case metal: - __intrinsic_asm "dfd$(xOrY)"; - case spirv: - return spirv_asm 
{OpDPd$(xOrY) $$T result $x}; - case wgsl: - __intrinsic_asm "dpd$(xOrY)"; - } -} +// Unsigned->float cases: -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] -vector dd$(xOrY)(vector x) -{ - __requireComputeDerivative(); - __target_switch - { - case hlsl: - case cpp: - case cuda: - __intrinsic_asm "dd$(xOrY)"; - case glsl: - __intrinsic_asm "dFd$(xOrY)"; - case metal: - __intrinsic_asm "dfd$(xOrY)"; - case spirv: - return spirv_asm {OpDPd$(xOrY) $$vector result $x}; - case wgsl: - __intrinsic_asm "dpd$(xOrY)"; +[__readNone] +[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +float16_t asfloat16(uint16_t value) +{ + __target_switch + { + case cuda: __intrinsic_asm "__ushort_as_half"; + case glsl: __intrinsic_asm "float16_t(unpackHalf2x16($0).x)"; + case hlsl: __intrinsic_asm "asfloat16"; + case spirv: return spirv_asm { + OpBitcast $$float16_t result $value + }; } } -__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] -matrix dd$(xOrY)(matrix x) +[require(cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +vector asfloat16(vector value) { - __requireComputeDerivative(); __target_switch { - case hlsl: - __intrinsic_asm "dd$(xOrY)"; + case hlsl: __intrinsic_asm "asfloat16"; + case spirv: return spirv_asm { + result:$$vector = OpBitcast $value + }; default: - MATRIX_MAP_UNARY(T, N, M, dd$(xOrY), x); + VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); } } -/// @category derivative -__generic -__glsl_extension(GL_ARB_derivative_control) [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] -T dd$(xOrY)_coarse(T x) +matrix asfloat16(matrix value) +{ MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); } + +// Float<->signed cases: + +[__unsafeForceInlineEarly] +[__readNone] +[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] +int16_t asint16(float16_t value) { - __requireComputeDerivative(); __target_switch { - case hlsl: __intrinsic_asm "dd$(xOrY)_coarse"; - case glsl: __intrinsic_asm "dFd$(xOrY)Coarse"; - case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$T = OpDPd$(xOrY)Coarse $x}; + case cuda: __intrinsic_asm "__half_as_short"; + case hlsl: __intrinsic_asm "asint16"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$int16_t result $value + }; + default: return asuint16(value); } } -__generic -__glsl_extension(GL_ARB_derivative_control) -[__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] -vector dd$(xOrY)_coarse(vector x) +[__unsafeForceInlineEarly] +[__readNone] +[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] +vector asint16(vector value) { - __requireComputeDerivative(); __target_switch { - case hlsl: __intrinsic_asm "dd$(xOrY)_coarse"; - case glsl: __intrinsic_asm "dFd$(xOrY)Coarse"; - case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$vector = OpDPd$(xOrY)Coarse $x}; + case hlsl: __intrinsic_asm "asint16"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + default: return asuint16(value); } } -__generic +[__unsafeForceInlineEarly] [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] -matrix dd$(xOrY)_coarse(matrix x) +[require(cuda_hlsl_spirv, shader5_sm_5_0)] +matrix asint16(matrix value) { - __requireComputeDerivative(); __target_switch { - case hlsl: - __intrinsic_asm "dd$(xOrY)_coarse"; - default: - MATRIX_MAP_UNARY(T, N, M, dd$(xOrY)_coarse, x); + case hlsl: __intrinsic_asm "asint16"; + default: return asuint16(value); } } -/// @category derivative 
-__generic -__glsl_extension(GL_ARB_derivative_control) [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] -T dd$(xOrY)_fine(T x) +[__unsafeForceInlineEarly] +[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] +float16_t asfloat16(int16_t value) { - __requireComputeDerivative(); __target_switch { - case hlsl: __intrinsic_asm "dd$(xOrY)_fine"; - case glsl: __intrinsic_asm "dFd$(xOrY)Fine"; - case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$T = OpDPd$(xOrY)Fine $x}; + case cuda: __intrinsic_asm "__short_as_half"; + case hlsl: __intrinsic_asm "asfloat16"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$float16_t result $value + }; + default: return asfloat16(asuint16(value)); } } -__generic -__glsl_extension(GL_ARB_derivative_control) +[__unsafeForceInlineEarly] [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] -vector dd$(xOrY)_fine(vector x) +[require(cuda_hlsl_metal_spirv, shader5_sm_5_0)] +vector asfloat16(vector value) { - __requireComputeDerivative(); __target_switch { - case hlsl: __intrinsic_asm "dd$(xOrY)_fine"; - case glsl: __intrinsic_asm "dFd$(xOrY)Fine"; - case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$vector = OpDPd$(xOrY)Fine $x}; + case hlsl: __intrinsic_asm "asfloat16"; + case metal: __intrinsic_asm "as_type<$TR>($0)"; + case spirv: return spirv_asm { + OpBitcast $$vector result $value + }; + default: return asfloat16(asuint16(value)); } } -__generic +[__unsafeForceInlineEarly] [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] -matrix dd$(xOrY)_fine(matrix x) +[require(cuda_hlsl_spirv, shader5_sm_5_0)] +matrix asfloat16(matrix value) { - __requireComputeDerivative(); __target_switch { - case hlsl: - __intrinsic_asm "dd$(xOrY)_fine"; - default: - MATRIX_MAP_UNARY(T, N, M, dd$(xOrY)_fine, x); + case hlsl: __intrinsic_asm "asfloat16"; + default: return asfloat16(asuint16(value)); } } -${{{{ -} // for (xOrY) -}}}} - - -/// Convert radians to degrees. +/// Inverse tangent (HLSL SM 1.0). 
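+/// Returns the angle in radians whose tangent is x; e.g. atan(1.0) is pi/4 (approximately 0.785398).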
/// @category math __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -T degrees(T x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T atan(T x) { __target_switch { - case glsl: __intrinsic_asm "degrees"; - case hlsl: __intrinsic_asm "degrees"; + case cpp: __intrinsic_asm "$P_atan($0)"; + case cuda: __intrinsic_asm "$P_atan($0)"; + case glsl: __intrinsic_asm "atan"; + case hlsl: __intrinsic_asm "atan"; + case metal: __intrinsic_asm "atan"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Degrees $x + OpExtInst $$T result glsl450 Atan $x }; - case wgsl: __intrinsic_asm "degrees"; - default: - return x * (T(180) / T.getPi()); + case wgsl: __intrinsic_asm "atan"; } } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -vector degrees(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector atan(vector x) { __target_switch { - case glsl: __intrinsic_asm "degrees"; - case hlsl: __intrinsic_asm "degrees"; + case glsl: __intrinsic_asm "atan"; + case hlsl: __intrinsic_asm "atan"; + case metal: __intrinsic_asm "atan"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Degrees $x + OpExtInst $$vector result glsl450 Atan $x }; - case wgsl: __intrinsic_asm "degrees"; + case wgsl: __intrinsic_asm "atan"; default: - VECTOR_MAP_UNARY(T, N, degrees, x); + VECTOR_MAP_UNARY(T, N, atan, x); } } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] -matrix degrees(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix atan(matrix x) { __target_switch { - case hlsl: __intrinsic_asm "degrees"; + case hlsl: __intrinsic_asm "atan"; default: - MATRIX_MAP_UNARY(T, N, M, degrees, x); + MATRIX_MAP_UNARY(T, N, M, atan, x); } } -/// Compute matrix determinant. +/// Inverse tangent (HLSL SM 1.0). /// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T atan2(T y, T x) +{ + __target_switch + { + case cpp: __intrinsic_asm "$P_atan2($0, $1)"; + case cuda: __intrinsic_asm "$P_atan2($0, $1)"; + case glsl: __intrinsic_asm "atan($0,$1)"; + case hlsl: __intrinsic_asm "atan2"; + case metal: __intrinsic_asm "atan2"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Atan2 $y $x + }; + case wgsl: __intrinsic_asm "atan2"; + } +} + __generic [__readNone] -[PreferCheckpoint] -[require(glsl_hlsl_metal_spirv_wgsl)] -T determinant(matrix m) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector atan2(vector y, vector x) { __target_switch { - case glsl: __intrinsic_asm "determinant"; - case hlsl: __intrinsic_asm "determinant"; - case metal: __intrinsic_asm "determinant"; + case glsl: __intrinsic_asm "atan($0,$1)"; + case hlsl: __intrinsic_asm "atan2"; + case metal: __intrinsic_asm "atan2"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Determinant $m + OpExtInst $$vector result glsl450 Atan2 $y $x }; - case wgsl: __intrinsic_asm "determinant"; + case wgsl: __intrinsic_asm "atan2"; + default: + VECTOR_MAP_BINARY(T, N, atan2, y, x); } } -/// Barrier for device memory. 
-/// @category barrier -__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] -void DeviceMemoryBarrier() +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix atan2(matrix y, matrix x) { __target_switch { - case hlsl: __intrinsic_asm "DeviceMemoryBarrier"; - case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; - case cuda: __intrinsic_asm "__threadfence()"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; - case spirv: spirv_asm - { - OpMemoryBarrier Device AcquireRelease|UniformMemory|ImageMemory; - }; - case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; + case hlsl: __intrinsic_asm "atan2"; + default: + MATRIX_MAP_BINARY(T, N, M, atan2, y, x); } } -/// @category barrier -/// Barrier for device memory with group synchronization. -__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] -void DeviceMemoryBarrierWithGroupSync() +/// Hyperbolic inverse tangent +/// @category math +__generic +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T atanh(T x) { __target_switch { - case hlsl: __intrinsic_asm "DeviceMemoryBarrierWithGroupSync"; - case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; - case cuda: __intrinsic_asm "__syncthreads()"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; - case spirv: spirv_asm - { - OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|ImageMemory; - }; - case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; + case cpp: __intrinsic_asm "$P_atanh($0)"; + case cuda: __intrinsic_asm "$P_atanh($0)"; + case glsl: __intrinsic_asm "atanh"; + case metal: __intrinsic_asm "atanh"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Atanh $x + }; + case wgsl: __intrinsic_asm "atanh"; + default: + return T(0.5) * log((T(1) + x) / (T(1) - x)); } } -// Vector distance -/// @category math -__generic +__generic [__readNone] +[ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T distance(vector x, vector y) +vector atanh(vector x) { __target_switch { - case glsl: __intrinsic_asm "distance"; - case hlsl: __intrinsic_asm "distance"; - case metal: __intrinsic_asm "distance"; + case glsl: __intrinsic_asm "atanh"; + case metal: __intrinsic_asm "atanh"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Distance $x $y + OpExtInst $$vector result glsl450 Atanh $x }; - case wgsl: __intrinsic_asm "distance"; + case wgsl: __intrinsic_asm "atanh"; default: - return length(x - y); + VECTOR_MAP_UNARY(T, N, atanh, x); } } +/// Ceiling (HLSL SM 1.0). 
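+/// Rounds toward positive infinity; e.g. ceil(2.1) returns 3.0 and ceil(-2.1) returns -2.0.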
+/// @category math __generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T distance(T x, T y) +T ceil(T x) { __target_switch { - case glsl: __intrinsic_asm "distance"; + case cpp: __intrinsic_asm "$P_ceil($0)"; + case cuda: __intrinsic_asm "$P_ceil($0)"; + case glsl: __intrinsic_asm "ceil"; + case hlsl: __intrinsic_asm "ceil"; + case metal: __intrinsic_asm "ceil"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Distance $x $y + OpExtInst $$T result glsl450 Ceil $x }; - case wgsl: __intrinsic_asm "distance"; - default: - return length(x - y); + case wgsl: __intrinsic_asm "ceil"; } } -/// Computes `max(0, x-y)`. -/// @category math -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_4_0_version)] -T fdim(T x, T y) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector ceil(vector x) { __target_switch { - case metal: __intrinsic_asm "fdim"; + case glsl: __intrinsic_asm "ceil"; + case hlsl: __intrinsic_asm "ceil"; + case metal: __intrinsic_asm "ceil"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Ceil $x + }; + case wgsl: __intrinsic_asm "ceil"; default: - return max(T(0), x - y); + VECTOR_MAP_UNARY(T, N, ceil, x); } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_4_0_version)] -vector fdim(vector x, vector y) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix ceil(matrix x) { __target_switch { - case metal: __intrinsic_asm "fdim"; + case hlsl: __intrinsic_asm "ceil"; default: - return max(T(0), x - y); + MATRIX_MAP_UNARY(T, N, M, ceil, x); } } -// divide +// Copy-sign /// @category math -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv)] -T divide(T x, T y) +vector copysign_half(vector x, vector y) +{ + let ux = reinterpret>(x); + let uy = reinterpret>(y); + vector signY = (uy & (uint16_t(1) << uint16_t(15))); + vector newX = (ux & ((uint16_t(1) << uint16_t(15)) - uint16_t(1))) + signY; + return reinterpret>(newX); +} + +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] +vector copysign_float(vector x, vector y) +{ + let ux = reinterpret>(x); + let uy = reinterpret>(y); + vector signY = (uy & (uint32_t(1) << uint32_t(31))); + vector newX = (ux & ((uint32_t(1) << uint32_t(31)) - uint32_t(1))) + signY; + return reinterpret>(newX); +} + +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] +vector copysign_double(vector x, vector y) +{ + let ux = reinterpret>(x); + let uy = reinterpret>(y); + vector signY = (uy & (uint64_t(1) << uint64_t(63))); + vector newX = (ux & ((uint64_t(1) << uint64_t(63)) - uint64_t(1))) + signY; + return reinterpret>(newX); +} + +__generic +__intrinsic_op($(kIROp_FloatCast)) +vector __real_cast(vector val); + +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] +vector copysign(vector x, vector y) { __target_switch { - case metal: __intrinsic_asm "divide"; + case metal: __intrinsic_asm "copysign"; default: - return x / y; + { + // sign of -0.0 needs to be respected. 
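+        // e.g. copysign(2.0, -0.0) must return -2.0; sign(-0.0) is 0, so an abs(x) * sign(y) formulation would lose the sign.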
+ if (T is half) + return __real_cast(copysign_half( + __real_cast(x), + __real_cast(y))); + if (T is float) + return __real_cast(copysign_float( + __real_cast(x), + __real_cast(y))); + return __real_cast(copysign_double( + __real_cast(x), + __real_cast(y))); + } } } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv)] -vector divide(vector x, vector y) +T copysign(T x, T y) { __target_switch { - case metal: __intrinsic_asm "divide"; + case metal: __intrinsic_asm "copysign"; default: - return x / y; + return copysign(vector(x), vector(y))[0]; } } -/// Vector dot product + +// Check access status to tiled resource +[ForceInline] +[require(hlsl, sm_5_0)] +bool CheckAccessFullyMapped(out uint status) +{ + __target_switch + { + case hlsl: __intrinsic_asm "CheckAccessFullyMapped"; + } +} + +/// Clamp (HLSL SM 1.0). /// @category math -__generic +__generic [__readNone] -[ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T dot(T x, T y) +T clamp(T x, T minBound, T maxBound) { __target_switch { - case glsl: __intrinsic_asm "dot"; - case hlsl: __intrinsic_asm "dot"; - case wgsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "clamp"; + case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; + case spirv: + if (__isSignedInt()) + return spirv_asm { + result:$$T = OpExtInst glsl450 SClamp $x $minBound $maxBound + }; + else + return spirv_asm { + result:$$T = OpExtInst glsl450 UClamp $x $minBound $maxBound + }; + case wgsl: __intrinsic_asm "clamp"; default: - return x * y; + return min(max(x, minBound), maxBound); } } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T dot(vector x, vector y) +vector clamp(vector x, vector minBound, vector maxBound) { __target_switch { - case glsl: __intrinsic_asm "dot"; - case hlsl: __intrinsic_asm "dot"; - case metal: __intrinsic_asm "dot"; - case spirv: return spirv_asm { - OpDot $$T result $x $y - }; - case wgsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "clamp"; + case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; + case spirv: + if (__isSignedInt()) + return spirv_asm { + result:$$vector = OpExtInst glsl450 SClamp $x $minBound $maxBound + }; + else + return spirv_asm { + result:$$vector = OpExtInst glsl450 UClamp $x $minBound $maxBound + }; + case wgsl: __intrinsic_asm "clamp"; default: - T result = T(0); - for(int i = 0; i < N; ++i) - result += x[i] * y[i]; - return result; + return min(max(x, minBound), maxBound); } } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T dot(vector x, vector y) +matrix clamp(matrix x, matrix minBound, matrix maxBound) { __target_switch { - case hlsl: __intrinsic_asm "dot"; - case wgsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "clamp"; default: - T result = T(0); - for(int i = 0; i < N; ++i) - result += x[i] * y[i]; - return result; + return min(max(x, minBound), maxBound); } } -/// Helper for computing distance terms for lighting (obsolete) -/// @category math -/// @deprecated -__generic vector dst(vector x, vector y); - -// Given a RWByteAddressBuffer allow it to be interpreted as a RWStructuredBuffer -__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer)) -RWStructuredBuffer __getEquivalentStructuredBuffer(RWByteAddressBuffer b); - -__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer)) -StructuredBuffer __getEquivalentStructuredBuffer(ByteAddressBuffer b); - 
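// Illustrative sketch of the reinterpretation (the generic arguments shown are assumed;
// they are elided in the surrounding declarations): the structured view aliases the same
// storage, so element i covers bytes [i * sizeof(T), (i + 1) * sizeof(T)) of the raw buffer.
//
//   RWByteAddressBuffer rawBuffer;
//   RWStructuredBuffer<float4> view = __getEquivalentStructuredBuffer<float4>(rawBuffer);
//   view[2] = float4(0, 0, 0, 1);   // touches the same 16 bytes as a store at byte offset 32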
-__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer)) -RasterizerOrderedStructuredBuffer __getEquivalentStructuredBuffer(RasterizerOrderedByteAddressBuffer b); - -// Error message - -// void errorf( string format, ... ); - -// Attribute evaluation - -// TODO: The matrix cases of these functions won't actuall work -// when compiled to GLSL, since they only support scalar/vector - -// TODO: Should these be constrains to `__BuiltinFloatingPointType`? -// TODO: SPIRV-direct does not support non-floating-point types. - -__generic +__generic [__readNone] -[require(glsl_spirv, fragmentprocessing)] -T EvaluateAttributeAtCentroid(T x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T clamp(T x, T minBound, T maxBound) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtCentroid"; + case hlsl: __intrinsic_asm "clamp"; + case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 InterpolateAtCentroid $x + result:$$T = OpExtInst glsl450 FClamp $x $minBound $maxBound }; + case wgsl: __intrinsic_asm "clamp"; + default: + return min(max(x, minBound), maxBound); } } -__generic +__generic [__readNone] -[require(glsl_spirv, fragmentprocessing)] -vector EvaluateAttributeAtCentroid(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector clamp(vector x, vector minBound, vector maxBound) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtCentroid"; + case hlsl: __intrinsic_asm "clamp"; + case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 InterpolateAtCentroid $x + result:$$vector = OpExtInst glsl450 FClamp $x $minBound $maxBound }; + case wgsl: __intrinsic_asm "clamp"; + default: + return min(max(x, minBound), maxBound); } } -__generic +__generic [__readNone] -[require(glsl_spirv, fragmentprocessing)] -matrix EvaluateAttributeAtCentroid(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix clamp(matrix x, matrix minBound, matrix maxBound) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtCentroid"; + case hlsl: __intrinsic_asm "clamp"; default: - MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x); + return min(max(x, minBound), maxBound); } } -__generic -[__readNone] -[require(glsl_spirv, fragmentprocessing)] -T EvaluateAttributeAtSample(T x, uint sampleindex) +/// Clip (discard) fragment conditionally +__generic +[require(cpp_cuda_glsl_hlsl_spirv, fragment)] +void clip(T x) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 InterpolateAtSample $x $sampleindex - }; + case hlsl: __intrinsic_asm "clip"; + default: + if(x < T(0)) discard; } } -__generic -[__readNone] -[require(glsl_spirv, fragmentprocessing)] -vector EvaluateAttributeAtSample(vector x, uint sampleindex) +__generic +[require(cpp_cuda_glsl_hlsl_spirv, fragment)] +void clip(vector x) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 InterpolateAtSample $x $sampleindex - }; + case hlsl: __intrinsic_asm "clip"; + default: + if(any(x < T(0))) discard; } } -__generic -[__readNone] -[require(glsl_spirv, fragmentprocessing)] -matrix EvaluateAttributeAtSample(matrix x, uint sampleindex) +__generic +[require(cpp_cuda_glsl_hlsl_spirv, fragment)] +void clip(matrix x) { 
__target_switch { - case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; + case hlsl: __intrinsic_asm "clip"; default: - matrix result; - for(int i = 0; i < N; ++i) - { - result[i] = EvaluateAttributeAtSample(x[i], sampleindex); - } - return result; + if(any(x < T(0))) discard; } } -__generic +/// @category math +__generic [__readNone] -[require(glsl_spirv, fragmentprocessing)] -T EvaluateAttributeSnapped(T x, int2 offset) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T cos(T x) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; - case spirv: - { - const float2 tmp = float2(16.f, 16.f); - return spirv_asm { - %foffset:$$float2 = OpConvertSToF $offset; - %offsetdiv16:$$float2 = OpFDiv %foffset $tmp; - result:$$T = OpExtInst glsl450 InterpolateAtOffset $x %offsetdiv16 - }; - } + case cpp: __intrinsic_asm "$P_cos($0)"; + case cuda: __intrinsic_asm "$P_cos($0)"; + case glsl: __intrinsic_asm "cos"; + case hlsl: __intrinsic_asm "cos"; + case metal: __intrinsic_asm "cos"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Cos $x + }; + case wgsl: __intrinsic_asm "cos"; } } -__generic +__generic [__readNone] -[require(glsl_spirv, fragmentprocessing)] -vector EvaluateAttributeSnapped(vector x, int2 offset) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector cos(vector x) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; - case spirv: - { - const float2 tmp = float2(16.f, 16.f); - return spirv_asm { - %foffset:$$float2 = OpConvertSToF $offset; - %offsetdiv16:$$float2 = OpFDiv %foffset $tmp; - result:$$vector = OpExtInst glsl450 InterpolateAtOffset $x %offsetdiv16 - }; - } + case glsl: __intrinsic_asm "cos"; + case hlsl: __intrinsic_asm "cos"; + case metal: __intrinsic_asm "cos"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cos $x + }; + case wgsl: __intrinsic_asm "cos"; + default: + VECTOR_MAP_UNARY(T,N, cos, x); } } -__generic +__generic [__readNone] -[require(glsl_spirv, fragmentprocessing)] -matrix EvaluateAttributeSnapped(matrix x, int2 offset) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix cos(matrix x) { __target_switch { - case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; + case hlsl: __intrinsic_asm "cos"; default: - matrix result; - for(int i = 0; i < N; ++i) - { - result[i] = EvaluateAttributeSnapped(x[i], offset); - } - return result; + MATRIX_MAP_UNARY(T, N, M, cos, x); } } -/// Computes base-e exponent. +/// Hyperbolic cosine. 
/// @category math __generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T exp(T x) +T cosh(T x) { __target_switch { - case cpp: __intrinsic_asm "$P_exp($0)"; - case cuda: __intrinsic_asm "$P_exp($0)"; - case glsl: __intrinsic_asm "exp"; - case hlsl: __intrinsic_asm "exp"; - case metal: __intrinsic_asm "exp"; + case cpp: __intrinsic_asm "$P_cosh($0)"; + case cuda: __intrinsic_asm "$P_cosh($0)"; + case glsl: __intrinsic_asm "cosh"; + case hlsl: __intrinsic_asm "cosh"; + case metal: __intrinsic_asm "cosh"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Exp $x + OpExtInst $$T result glsl450 Cosh $x }; - case wgsl: __intrinsic_asm "exp"; + case wgsl: __intrinsic_asm "cosh"; } } -/// @category math __generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector exp(vector x) +vector cosh(vector x) { __target_switch { - case glsl: __intrinsic_asm "exp"; - case hlsl: __intrinsic_asm "exp"; - case metal: __intrinsic_asm "exp"; + case glsl: __intrinsic_asm "cosh"; + case hlsl: __intrinsic_asm "cosh"; + case metal: __intrinsic_asm "cosh"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Exp $x + OpExtInst $$vector result glsl450 Cosh $x }; - case wgsl: __intrinsic_asm "exp"; + case wgsl: __intrinsic_asm "cosh"; default: - VECTOR_MAP_UNARY(T, N, exp, x); + VECTOR_MAP_UNARY(T,N, cosh, x); } } -/// @category math __generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix exp(matrix x) +matrix cosh(matrix x) { __target_switch { - case hlsl: __intrinsic_asm "exp"; + case hlsl: __intrinsic_asm "cosh"; default: - MATRIX_MAP_UNARY(T, N, M, exp, x); + MATRIX_MAP_UNARY(T, N, M, cosh, x); } } -/// Computes base-2 exponent +/// Compute the cosine of an angle in degrees. /// @category math __generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T exp2(T x) +T cospi(T x) { __target_switch { - case glsl: - __intrinsic_asm "exp2($0)"; - case spirv: - if (__isHalf()) - { - return spirv_asm { OpExtInst $$T result glsl450 Exp2 $x }; - } - else - { - float xf = __realCast(x); - return T(spirv_asm { - result:$$float = OpExtInst glsl450 Exp2 $xf - }); - } - case hlsl: - __intrinsic_asm "exp2($0)"; - case metal: __intrinsic_asm "exp2"; - case cpp: - __intrinsic_asm "$P_exp2($0)"; - case cuda: - __intrinsic_asm "$P_exp2($0)"; - case wgsl: - __intrinsic_asm "exp2"; + case metal: __intrinsic_asm "cospi"; + default: + return cos(T.getPi() * x); } - } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector exp2(vector x) +vector cospi(vector x) { __target_switch { - case glsl: - __intrinsic_asm "exp2($0)"; - case hlsl: __intrinsic_asm "exp2"; - case metal: __intrinsic_asm "exp2"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Exp2 $x - }; - case wgsl: __intrinsic_asm "exp2"; + case metal: __intrinsic_asm "cospi"; default: - VECTOR_MAP_UNARY(T, N, exp2, x); + return cos(T.getPi() * x); + } +} + + +/// Population count. 
+/// @category bitops +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint countbits(uint value) +{ + __target_switch + { + case hlsl: + __intrinsic_asm "countbits"; + case glsl: + __intrinsic_asm "bitCount"; + case metal: + __intrinsic_asm "popcount"; + case cuda: + case cpp: + __intrinsic_asm "$P_countbits($0)"; + case spirv: + return spirv_asm {OpBitCount $$uint result $value}; + case wgsl: + __intrinsic_asm "countOneBits"; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix exp2(matrix x) +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector countbits(vector value) { __target_switch { - case hlsl: __intrinsic_asm "exp2"; + case hlsl: + __intrinsic_asm "countbits"; + case glsl: + __intrinsic_asm "bitCount"; + case metal: + __intrinsic_asm "popcount"; + case spirv: + return spirv_asm {OpBitCount $$vector result $value}; + case wgsl: + __intrinsic_asm "countOneBits"; default: - MATRIX_MAP_UNARY(T, N, M, exp2, x); + VECTOR_MAP_UNARY(uint, N, countbits, value); } } -/// Computes base-10 exponent +/// Cross product /// @category math __generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T exp10(T x) +vector cross(vector left, vector right) { + // TODO: SPIRV does not support integer vectors. __target_switch { - case metal: __intrinsic_asm "exp10"; + case glsl: __intrinsic_asm "cross"; + case hlsl: __intrinsic_asm "cross"; + case metal: __intrinsic_asm "cross"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cross $left $right + }; + case wgsl: __intrinsic_asm "cross"; default: - const T ln10 = T(2.302585092994045901); // ln(10) - return exp(x * ln10); + return vector( + left.y * right.z - left.z * right.y, + left.z * right.x - left.x * right.z, + left.x * right.y - left.y * right.x); } } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector exp10(vector x) +vector cross(vector left, vector right) { __target_switch { - case metal: __intrinsic_asm "exp10"; + case glsl: __intrinsic_asm "cross"; + case hlsl: __intrinsic_asm "cross"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Cross $left $right + }; + case wgsl: __intrinsic_asm "cross"; default: - const T ln10 = T(2.30258509299); // ln(10) - return exp(x * ln10); - } -} - - -/// Convert 16-bit float stored in low bits of integer -/// @category conversion Conversion functions -__glsl_version(420) -__cuda_sm_version(6.0) -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -float f16tof32(uint value) -{ - __target_switch - { - case glsl: __intrinsic_asm "unpackHalf2x16($0).x"; - case hlsl: __intrinsic_asm "f16tof32($0)"; - case cuda: __intrinsic_asm "__half2float(__ushort_as_half($0))"; - case cpp: __intrinsic_asm "f16tof32($0)"; - case metal: __intrinsic_asm "as_type((ushort)($0))"; - case spirv: - { - return spirv_asm { - %lowBits = OpUConvert $$uint16_t $value; - %half = OpBitcast $$half %lowBits; - result:$$float = OpFConvert %half - }; - } - case wgsl: __intrinsic_asm "unpack2x16float($0).x"; + return vector( + left.y * right.z - left.z * right.y, + left.z * right.x - left.x * right.z, + left.x * right.y - left.y * right.x); } } -__generic +// Convert encoded color [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector f16tof32(vector value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +int4 
D3DCOLORtoUBYTE4(float4 color) { __target_switch { - case hlsl: __intrinsic_asm "f16tof32"; - case spirv: - { - return spirv_asm { - %lowBits = OpUConvert $$vector $value; - %half = OpBitcast $$vector %lowBits; - result:$$vector = OpFConvert %half - }; - } + case hlsl: __intrinsic_asm "D3DCOLORtoUBYTE4"; + case wgsl: __intrinsic_asm "bitcast(pack4x8unorm($0)).zyxw"; default: - VECTOR_MAP_UNARY(float, N, f16tof32, value); + let scaled = color.zyxw * 255.001999f; + return int4(scaled); } } -/// Convert to 16-bit float stored in low bits of integer. -/// @category conversion -__glsl_version(420) -__cuda_sm_version(6.0) +// Partial-difference derivatives +${{{{ +const char* diffDimensions[2] = {"x", "y"}; +for (auto xOrY : diffDimensions) { +}}}} +/// @category derivative Derivative functions +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint f32tof16(float value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] +T dd$(xOrY)(T x) { + __requireComputeDerivative(); __target_switch { - case glsl: __intrinsic_asm "packHalf2x16(vec2($0,0.0))"; - case hlsl: __intrinsic_asm "f32tof16($0)"; - case cuda: __intrinsic_asm "__half_as_ushort(__float2half($0))"; - case cpp: __intrinsic_asm "f32tof16($0)"; - case metal: __intrinsic_asm "as_type((half)($0))"; + case hlsl: + case cpp: + case cuda: + __intrinsic_asm "dd$(xOrY)"; + case glsl: + __intrinsic_asm "dFd$(xOrY)"; + case metal: + __intrinsic_asm "dfd$(xOrY)"; case spirv: - { - return spirv_asm { - %half = OpFConvert $$half $value; - %lowBits = OpBitcast $$uint16_t %half; - result:$$uint = OpUConvert %lowBits - }; - } - case wgsl: __intrinsic_asm "pack2x16float(vec2f($0,0.0))"; + return spirv_asm {OpDPd$(xOrY) $$T result $x}; + case wgsl: + __intrinsic_asm "dpd$(xOrY)"; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector f32tof16(vector value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] +vector dd$(xOrY)(vector x) { + __requireComputeDerivative(); __target_switch { - case hlsl: __intrinsic_asm "f32tof16"; + case hlsl: + case cpp: + case cuda: + __intrinsic_asm "dd$(xOrY)"; + case glsl: + __intrinsic_asm "dFd$(xOrY)"; + case metal: + __intrinsic_asm "dfd$(xOrY)"; case spirv: - { - return spirv_asm { - %half = OpFConvert $$vector $value; - %lowBits = OpBitcast $$vector %half; - result:$$vector = OpUConvert %lowBits - }; - } - default: - VECTOR_MAP_UNARY(uint, N, f32tof16, value); + return spirv_asm {OpDPd$(xOrY) $$vector result $x}; + case wgsl: + __intrinsic_asm "dpd$(xOrY)"; } } -// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -// The following is Slang specific and NOT part of standard HLSL -// It's not clear what happens with float16 time in HLSL -> can the float16 coerce to uint for example? 
If so that would -// give the wrong result - -__glsl_version(420) +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -float f16tof32(float16_t value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] +matrix dd$(xOrY)(matrix x) { + __requireComputeDerivative(); __target_switch { - case glsl: __intrinsic_asm "unpackHalf2x16($0).x"; - case hlsl: __intrinsic_asm "f16tof32($0)"; - case cuda: __intrinsic_asm "__half2float($0)"; - case cpp: __intrinsic_asm "f16tof32($0)"; - case metal: __intrinsic_asm "float($0)"; - case spirv: - { - return spirv_asm { - result:$$float = OpFConvert $value - }; - } - case wgsl: __intrinsic_asm "f32($0)"; + case hlsl: + __intrinsic_asm "dd$(xOrY)"; + default: + MATRIX_MAP_UNARY(T, N, M, dd$(xOrY), x); } } -__generic +/// @category derivative +__generic +__glsl_extension(GL_ARB_derivative_control) [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector f16tof32(vector value) +[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] +T dd$(xOrY)_coarse(T x) { + __requireComputeDerivative(); __target_switch { - case cuda: __intrinsic_asm "__half2float"; - case hlsl: __intrinsic_asm "f16tof32"; - case metal: __intrinsic_asm "$TR($0)"; - case spirv: return spirv_asm { - OpFConvert $$vector result $value - }; - default: - VECTOR_MAP_UNARY(float, N, f16tof32, value); + case hlsl: __intrinsic_asm "dd$(xOrY)_coarse"; + case glsl: __intrinsic_asm "dFd$(xOrY)Coarse"; + case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$T = OpDPd$(xOrY)Coarse $x}; } } -/// Convert to float16_t. -/// @category conversion -__glsl_version(420) +__generic +__glsl_extension(GL_ARB_derivative_control) [__readNone] -[require(cuda_glsl_metal_spirv_wgsl, shader5_sm_5_0)] -float16_t f32tof16_(float value) +[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] +vector dd$(xOrY)_coarse(vector x) { + __requireComputeDerivative(); __target_switch { - case cuda: __intrinsic_asm "__float2half"; - case glsl: __intrinsic_asm "packHalf2x16(vec2($0,0.0))"; - case metal: __intrinsic_asm "half($0)"; - case spirv: return spirv_asm { - OpFConvert $$float16_t result $value - }; - case wgsl: __intrinsic_asm "f16($0)"; + case hlsl: __intrinsic_asm "dd$(xOrY)_coarse"; + case glsl: __intrinsic_asm "dFd$(xOrY)Coarse"; + case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$vector = OpDPd$(xOrY)Coarse $x}; } } -__generic +__generic [__readNone] -[require(cuda_glsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector f32tof16_(vector value) +[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] +matrix dd$(xOrY)_coarse(matrix x) { - __target_switch - { - case cuda: __intrinsic_asm "__float2half"; - case metal: __intrinsic_asm "$TR($0)"; - case spirv: return spirv_asm { - OpFConvert $$vector result $value - }; + __requireComputeDerivative(); + __target_switch + { + case hlsl: + __intrinsic_asm "dd$(xOrY)_coarse"; default: - VECTOR_MAP_UNARY(float16_t, N, f32tof16_, value); + MATRIX_MAP_UNARY(T, N, M, dd$(xOrY)_coarse, x); } } -// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -/// Flip surface normal to face forward, if needed. 
-/// @category math -__generic +/// @category derivative +__generic +__glsl_extension(GL_ARB_derivative_control) [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector faceforward(vector n, vector i, vector ng) +[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] +T dd$(xOrY)_fine(T x) { + __requireComputeDerivative(); __target_switch { - case glsl: __intrinsic_asm "faceforward"; - case hlsl: __intrinsic_asm "faceforward"; - case metal: __intrinsic_asm "faceforward"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 FaceForward $n $i $ng - }; - case wgsl: __intrinsic_asm "faceForward"; - default: - return dot(ng, i) < T(0.0f) ? n : -n; + case hlsl: __intrinsic_asm "dd$(xOrY)_fine"; + case glsl: __intrinsic_asm "dFd$(xOrY)Fine"; + case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$T = OpDPd$(xOrY)Fine $x}; } } -/// Find first set bit starting at high bit and working down. -/// @category bitops Bit operation functions +__generic +__glsl_extension(GL_ARB_derivative_control) [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int firstbithigh(int value) +[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] +vector dd$(xOrY)_fine(vector x) { + __requireComputeDerivative(); __target_switch { - case cpp: __intrinsic_asm "$P_firstbithigh($0)"; - case cuda: __intrinsic_asm "$P_firstbithigh($0)"; - case glsl: __intrinsic_asm "findMSB"; - case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; - case spirv: return spirv_asm { - OpExtInst $$int result glsl450 FindSMsb $value - }; - case wgsl: __intrinsic_asm "firstLeadingBit"; + case hlsl: __intrinsic_asm "dd$(xOrY)_fine"; + case glsl: __intrinsic_asm "dFd$(xOrY)Fine"; + case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$vector = OpDPd$(xOrY)Fine $x}; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector firstbithigh(vector value) +[require(glsl_hlsl_spirv, fragmentprocessing_derivativecontrol)] +matrix dd$(xOrY)_fine(matrix x) { + __requireComputeDerivative(); __target_switch { - case glsl: __intrinsic_asm "findMSB"; - case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 FindSMsb $value - }; - case wgsl: __intrinsic_asm "firstLeadingBit"; + case hlsl: + __intrinsic_asm "dd$(xOrY)_fine"; default: - VECTOR_MAP_UNARY(int, N, firstbithigh, value); + MATRIX_MAP_UNARY(T, N, M, dd$(xOrY)_fine, x); } } +${{{{ +} // for (xOrY) +}}}} + + +/// Convert radians to degrees. 
+/// @category math +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint firstbithigh(uint value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +T degrees(T x) { __target_switch { - case cpp: __intrinsic_asm "$P_firstbithigh($0)"; - case cuda: __intrinsic_asm "$P_firstbithigh($0)"; - case glsl: __intrinsic_asm "findMSB"; - case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; + case glsl: __intrinsic_asm "degrees"; + case hlsl: __intrinsic_asm "degrees"; case spirv: return spirv_asm { - OpExtInst $$uint result glsl450 FindUMsb $value + OpExtInst $$T result glsl450 Degrees $x }; - case wgsl: __intrinsic_asm "firstLeadingBit"; + case wgsl: __intrinsic_asm "degrees"; + default: + return x * (T(180) / T.getPi()); } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector firstbithigh(vector value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +vector degrees(vector x) { __target_switch { - case glsl: __intrinsic_asm "findMSB"; - case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; + case glsl: __intrinsic_asm "degrees"; + case hlsl: __intrinsic_asm "degrees"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 FindUMsb $value + OpExtInst $$vector result glsl450 Degrees $x }; - case wgsl: __intrinsic_asm "firstLeadingBit"; + case wgsl: __intrinsic_asm "degrees"; default: - VECTOR_MAP_UNARY(uint, N, firstbithigh, value); + VECTOR_MAP_UNARY(T, N, degrees, x); } } -/// Find first set bit starting at low bit and working up. -/// @category bitops +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int firstbitlow(int value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl)] +matrix degrees(matrix x) { __target_switch { - case cpp: __intrinsic_asm "$P_firstbitlow($0)"; - case cuda: __intrinsic_asm "$P_firstbitlow($0)"; - case glsl: __intrinsic_asm "findLSB"; - case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; - case spirv: return spirv_asm { - OpExtInst $$int result glsl450 FindILsb $value - }; - case wgsl: __intrinsic_asm "firstTrailingBit"; + case hlsl: __intrinsic_asm "degrees"; + default: + MATRIX_MAP_UNARY(T, N, M, degrees, x); } } -__generic +/// Compute matrix determinant. +/// @category math +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector firstbitlow(vector value) +[PreferCheckpoint] +[require(glsl_hlsl_metal_spirv_wgsl)] +T determinant(matrix m) { __target_switch { - case glsl: __intrinsic_asm "findLSB"; - case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; + case glsl: __intrinsic_asm "determinant"; + case hlsl: __intrinsic_asm "determinant"; + case metal: __intrinsic_asm "determinant"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 FindILsb $value + OpExtInst $$T result glsl450 Determinant $m }; - case wgsl: __intrinsic_asm "firstTrailingBit"; - default: - VECTOR_MAP_UNARY(int, N, firstbitlow, value); + case wgsl: __intrinsic_asm "determinant"; } } -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint firstbitlow(uint value) +/// Barrier for device memory. 
+/// @category barrier +__glsl_extension(GL_KHR_memory_scope_semantics) +[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] +void DeviceMemoryBarrier() { __target_switch { - case cpp: __intrinsic_asm "$P_firstbitlow($0)"; - case cuda: __intrinsic_asm "$P_firstbitlow($0)"; - case glsl: __intrinsic_asm "findLSB"; - case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; - case spirv: return spirv_asm { - OpExtInst $$uint result glsl450 FindILsb $value - }; - case wgsl: __intrinsic_asm "firstTrailingBit"; + case hlsl: __intrinsic_asm "DeviceMemoryBarrier"; + case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; + case cuda: __intrinsic_asm "__threadfence()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; + case spirv: spirv_asm + { + OpMemoryBarrier Device AcquireRelease|UniformMemory|ImageMemory; + }; + case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; } } -__generic -[__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector firstbitlow(vector value) +/// @category barrier +/// Barrier for device memory with group synchronization. +__glsl_extension(GL_KHR_memory_scope_semantics) +[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] +void DeviceMemoryBarrierWithGroupSync() { __target_switch { - case glsl: __intrinsic_asm "findLSB"; - case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 FindILsb $value - }; - case wgsl: __intrinsic_asm "firstTrailingBit"; - default: - VECTOR_MAP_UNARY(uint, N, firstbitlow, value); + case hlsl: __intrinsic_asm "DeviceMemoryBarrierWithGroupSync"; + case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; + case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; + case spirv: spirv_asm + { + OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|ImageMemory; + }; + case wgsl: __intrinsic_asm "storageBarrier(); textureBarrier(); workgroupBarrier();"; } } -/// Floor (HLSL SM 1.0). 
+// Vector distance /// @category math -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T floor(T x) +T distance(vector x, vector y) { __target_switch { - case cpp: __intrinsic_asm "$P_floor($0)"; - case cuda: __intrinsic_asm "$P_floor($0)"; - case glsl: __intrinsic_asm "floor"; - case hlsl: __intrinsic_asm "floor"; - case metal: __intrinsic_asm "floor"; + case glsl: __intrinsic_asm "distance"; + case hlsl: __intrinsic_asm "distance"; + case metal: __intrinsic_asm "distance"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Floor $x + OpExtInst $$T result glsl450 Distance $x $y }; - case wgsl: __intrinsic_asm "floor"; + case wgsl: __intrinsic_asm "distance"; + default: + return length(x - y); } } -__generic +__generic [__readNone] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector floor(vector x) +T distance(T x, T y) { __target_switch { - case glsl: __intrinsic_asm "floor"; - case hlsl: __intrinsic_asm "floor"; - case metal: __intrinsic_asm "floor"; + case glsl: __intrinsic_asm "distance"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Floor $x + OpExtInst $$T result glsl450 Distance $x $y }; - case wgsl: __intrinsic_asm "floor"; + case wgsl: __intrinsic_asm "distance"; default: - VECTOR_MAP_UNARY(T, N, floor, x); + return length(x - y); } } -__generic +/// Computes `max(0, x-y)`. +/// @category math +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix floor(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_4_0_version)] +T fdim(T x, T y) { __target_switch { - case hlsl: __intrinsic_asm "floor"; + case metal: __intrinsic_asm "fdim"; default: - MATRIX_MAP_UNARY(T, N, M, floor, x); + return max(T(0), x - y); } -} - -/// Fused multiply-add. 
-/// @category math -__generic +} + +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -T fma(T a, T b, T c) +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_4_0_version)] +vector fdim(vector x, vector y) { __target_switch { - case cpp: __intrinsic_asm "$P_fma($0, $1, $2)"; - case cuda: __intrinsic_asm "$P_fma($0, $1, $2)"; - case glsl: __intrinsic_asm "fma"; - case hlsl: - if (__isFloat() || __isHalf()) - return mad(a, b, c); - else - __intrinsic_asm "fma($0, $1, $2)"; - case metal: __intrinsic_asm "fma"; - case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Fma $a $b $c - }; - case wgsl: __intrinsic_asm "fma"; + case metal: __intrinsic_asm "fdim"; default: - return a*b + c; + return max(T(0), x - y); } } -__generic +// divide +/// @category math +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector fma(vector a, vector b, vector c) +[require(cpp_cuda_glsl_hlsl_metal_spirv)] +T divide(T x, T y) { __target_switch { - case glsl: __intrinsic_asm "fma"; - case hlsl: __intrinsic_asm "fma"; - case metal: __intrinsic_asm "fma"; - case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Fma $a $b $c - }; - case wgsl: __intrinsic_asm "fma"; + case metal: __intrinsic_asm "divide"; default: - VECTOR_MAP_TRINARY(T, N, fma, a, b, c); + return x / y; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -matrix fma(matrix a, matrix b, matrix c) +[require(cpp_cuda_glsl_hlsl_metal_spirv)] +vector divide(vector x, vector y) { __target_switch { - case hlsl: __intrinsic_asm "fma"; + case metal: __intrinsic_asm "divide"; default: - MATRIX_MAP_TRINARY(T, N, M, fma, a, b, c); + return x / y; } } -/// Floating point remainder of x/y. -/// The floating-point remainder is calculated such that x = i * y + f, -/// where i is an integer, f has the same sign as x, and the absolute value -/// of f is less than the absolute value of y. +/// Vector dot product /// @category math __generic [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T fmod(T x, T y) +T dot(T x, T y) { - // In HLSL, `fmod` returns a remainder. - // Definition of `fmod` in HLSL is, - // "The floating-point remainder is calculated such that x = i * y + f, - // where i is an integer, f has the same sign as x, and the absolute value - // of f is less than the absolute value of y." - // - // In GLSL, `mod` is a Modulus function. - // OpenGL document defines "Modulus" as "Returns x - y * floor(x / y)". - // The use of "Floor()" makes the difference. - // - // In Metal, `fmod` is Modulus function. - // Metal document defines it as "Returns x - y * trunc(x/y)". - // Note that the function name is same to HLSL but it behaves differently. - // - // The tricky ones are when x or y is a negative value. 
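    // (Illustrative check of the GLSL lowering further below, using values from the table:
    //  for x = -4, y = 3 the emitted expression -mod(-x, abs(y)) evaluates to -mod(4, 3) = -1,
    //  matching the HLSL remainder, whereas a plain mod(-4, 3) would give 2.)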
- // - // | Remainder | Modulus - // x y | x= i*y +f | x-y*floor(x/y) - // ------+-----------+------------------------------ - // 4 3 | 4= 1*3 +1 | 4-3*floor( 4/3) = 4-3* 1 = 1 - // 3 3 | 3= 1*3 +0 | 3-3*floor( 3/3) = 3-3* 1 = 0 - // 2 3 | 2= 0*3 +2 | 2-3*floor( 2/3) = 2-3* 0 = 2 - // 1 3 | 1= 0*3 +1 | 1-3*floor( 1/3) = 1-3* 0 = 1 - // 0 3 | 0= 0*3 +0 | 0-3*floor( 0/3) = 0-3* 0 = 0 - // -1 3 |-1= 0*3 -1 |-1-3*floor(-1/3) =-1-3*-1 = 2 - // -2 3 |-2= 0*3 -2 |-2-3*floor(-2/3) =-2-3*-1 = 1 - // -3 3 |-3=-1*3 0 |-3-3*floor(-3/3) =-3-3*-1 = 0 - // -4 3 |-4=-1*3 -1 |-4-3*floor(-4/3) =-4-3*-2 = 2 - // - // When y is a negative value, - // - // | Remainder | Modulus - // x y | x= i*y +f | x-y*floor(x/y) - // ------+-----------+------------------------------ - // 4 -3 | 4=-1*-3+1 | 4+3*floor( 4/-3) = 4+3*-2 =-2 - // 3 -3 | 3=-1*-3+0 | 3+3*floor( 3/-3) = 3+3*-1 = 0 - // 2 -3 | 2= 0*-3+2 | 2+3*floor( 2/-3) = 2+3*-1 =-1 - // 1 -3 | 1= 0*-3+1 | 1+3*floor( 1/-3) = 1+3*-1 =-2 - // 0 -3 | 0= 0*-3+0 | 0+3*floor( 0/-3) = 0+3* 0 = 0 - // -1 -3 |-1= 0*-3-1 |-1+3*floor(-1/-3) =-1+3* 0 =-1 - // -2 -3 |-2= 0*-3-2 |-2+3*floor(-2/-3) =-2+3* 0 =-2 - // -3 -3 |-3= 1*-3 0 |-3+3*floor(-3/-3) =-3+3* 1 = 0 - // -4 -3 |-4= 1*-3-1 |-4+3*floor(-4/-3) =-4+3* 1 =-1 - __target_switch { - case cpp: __intrinsic_asm "$P_fmod($0, $1)"; - case cuda: __intrinsic_asm "$P_fmod($0, $1)"; - case glsl: - // GLSL doesn't have a function for remainder. - __intrinsic_asm "(($0 < 0.0) ? -mod(-$0,abs($1)) : mod($0,abs($1)))"; - case hlsl: __intrinsic_asm "fmod"; - case metal: - // Metal doesn't have a function for remainder. - __intrinsic_asm "(($0 < 0.0) ? -fmod(-$0,abs($1)) : fmod($0,abs($1)))"; - case spirv: - // OpFRem return "The floating-point remainder whose sign - // matches the sign of Operand 1", where Operand 1 is "x". - return spirv_asm - { - result:$$T = OpFRem $x $y - }; - case wgsl: - __intrinsic_asm "(($0) % ($1))"; + case glsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "dot"; + case wgsl: __intrinsic_asm "dot"; + default: + return x * y; } } __generic [__readNone] -[ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector fmod(vector x, vector y) +T dot(vector x, vector y) { __target_switch { - case hlsl: __intrinsic_asm "fmod"; + case glsl: __intrinsic_asm "dot"; + case hlsl: __intrinsic_asm "dot"; + case metal: __intrinsic_asm "dot"; case spirv: return spirv_asm { - result:$$vector = OpFRem $x $y + OpDot $$T result $x $y }; + case wgsl: __intrinsic_asm "dot"; default: - VECTOR_MAP_BINARY(T, N, fmod, x, y); + T result = T(0); + for(int i = 0; i < N; ++i) + result += x[i] * y[i]; + return result; } } -__generic +__generic [__readNone] -[ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix fmod(matrix x, matrix y) +T dot(vector x, vector y) { __target_switch { - case hlsl: __intrinsic_asm "fmod"; + case hlsl: __intrinsic_asm "dot"; + case wgsl: __intrinsic_asm "dot"; default: - MATRIX_MAP_BINARY(T, N, M, fmod, x, y); + T result = T(0); + for(int i = 0; i < N; ++i) + result += x[i] * y[i]; + return result; } } -/// Extract the fractional part of a floating-point number. 
+/// Helper for computing distance terms for lighting (obsolete) /// @category math -__generic +/// @deprecated +__generic vector dst(vector x, vector y); + +// Given a RWByteAddressBuffer allow it to be interpreted as a RWStructuredBuffer +__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer)) +RWStructuredBuffer __getEquivalentStructuredBuffer(RWByteAddressBuffer b); + +__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer)) +StructuredBuffer __getEquivalentStructuredBuffer(ByteAddressBuffer b); + +__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer)) +RasterizerOrderedStructuredBuffer __getEquivalentStructuredBuffer(RasterizerOrderedByteAddressBuffer b); + +// Error message + +// void errorf( string format, ... ); + +// Attribute evaluation + +// TODO: The matrix cases of these functions won't actuall work +// when compiled to GLSL, since they only support scalar/vector + +// TODO: Should these be constrains to `__BuiltinFloatingPointType`? +// TODO: SPIRV-direct does not support non-floating-point types. + +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T frac(T x) +[require(glsl_spirv, fragmentprocessing)] +T EvaluateAttributeAtCentroid(T x) { __target_switch { - case cpp: __intrinsic_asm "$P_frac($0)"; - case cuda: __intrinsic_asm "$P_frac($0)"; - case glsl: __intrinsic_asm "fract"; - case hlsl: __intrinsic_asm "frac"; - case metal: __intrinsic_asm "fract"; + case glsl: __intrinsic_asm "interpolateAtCentroid"; case spirv: return spirv_asm { - OpExtInst $$T result glsl450 Fract $x + OpExtInst $$T result glsl450 InterpolateAtCentroid $x }; - case wgsl: __intrinsic_asm "fract"; } } -__generic +__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector frac(vector x) +[require(glsl_spirv, fragmentprocessing)] +vector EvaluateAttributeAtCentroid(vector x) { __target_switch { - case glsl: __intrinsic_asm "fract"; - case hlsl: __intrinsic_asm "frac"; - case metal: __intrinsic_asm "fract"; + case glsl: __intrinsic_asm "interpolateAtCentroid"; case spirv: return spirv_asm { - OpExtInst $$vector result glsl450 Fract $x + OpExtInst $$vector result glsl450 InterpolateAtCentroid $x }; - case wgsl: __intrinsic_asm "fract"; - default: - VECTOR_MAP_UNARY(T, N, frac, x); } } -__generic -[__readNone] -matrix frac(matrix x) -{ - MATRIX_MAP_UNARY(T, N, M, frac, x); -} - -/// Extract the fractional part of a floating-point number. -/// @category math -__generic +__generic [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T fract(T x) +[require(glsl_spirv, fragmentprocessing)] +matrix EvaluateAttributeAtCentroid(matrix x) { - return frac(x); + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtCentroid"; + default: + MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x); + } } -__generic +__generic [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector fract(vector x) +[require(glsl_spirv, fragmentprocessing)] +T EvaluateAttributeAtSample(T x, uint sampleindex) { - return frac(x); + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 InterpolateAtSample $x $sampleindex + }; + } } -/// Split float into mantissa and exponent. 
-/// @category math -__generic +__generic [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -T frexp(T x, out int exp) +[require(glsl_spirv, fragmentprocessing)] +vector EvaluateAttributeAtSample(vector x, uint sampleindex) { __target_switch { - case cpp: __intrinsic_asm "$P_frexp($0, $1)"; - case cuda: __intrinsic_asm "$P_frexp($0, $1)"; - case glsl: __intrinsic_asm "frexp"; - case hlsl: __intrinsic_asm "frexp"; - case metal: __intrinsic_asm "frexp($0, *($1))"; + case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; case spirv: return spirv_asm { - result:$$T = OpExtInst glsl450 Frexp $x &exp + OpExtInst $$vector result glsl450 InterpolateAtSample $x $sampleindex }; - case wgsl: - T fract; - __wgsl_frexp(x, fract, exp); - return fract; } } -__generic +__generic [__readNone] -[ForceInline] -[require(wgsl)] -void __wgsl_frexp(T x, out T fract, out int exp) +[require(glsl_spirv, fragmentprocessing)] +matrix EvaluateAttributeAtSample(matrix x, uint sampleindex) { - __intrinsic_asm "{ var s = frexp($0); ($1) = s.fract; ($2) = s.exp; }"; + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtSample($0, int($1))"; + default: + matrix result; + for(int i = 0; i < N; ++i) + { + result[i] = EvaluateAttributeAtSample(x[i], sampleindex); + } + return result; + } } -__generic +__generic [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -vector frexp(vector x, out vector exp) +[require(glsl_spirv, fragmentprocessing)] +T EvaluateAttributeSnapped(T x, int2 offset) { __target_switch { - case glsl: __intrinsic_asm "frexp"; - case hlsl: __intrinsic_asm "frexp"; - case metal: __intrinsic_asm "frexp($0, *($1))"; - case spirv: return spirv_asm { - result:$$vector = OpExtInst glsl450 Frexp $x &exp - }; - case wgsl: - vector fract; - __wgsl_frexp(x, fract, exp); - return fract; - default: - VECTOR_MAP_BINARY(T, N, frexp, x, exp); + case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; + case spirv: + { + const float2 tmp = float2(16.f, 16.f); + return spirv_asm { + %foffset:$$float2 = OpConvertSToF $offset; + %offsetdiv16:$$float2 = OpFDiv %foffset $tmp; + result:$$T = OpExtInst glsl450 InterpolateAtOffset $x %offsetdiv16 + }; + } } } -__generic +__generic [__readNone] -[ForceInline] -[require(wgsl)] -void __wgsl_frexp(vector x, out vector fract, out vector exp) +[require(glsl_spirv, fragmentprocessing)] +vector EvaluateAttributeSnapped(vector x, int2 offset) { - __intrinsic_asm "{ var s = frexp($0); ($1) = s.fract; ($2) = s.exp; }"; + __target_switch + { + case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; + case spirv: + { + const float2 tmp = float2(16.f, 16.f); + return spirv_asm { + %foffset:$$float2 = OpConvertSToF $offset; + %offsetdiv16:$$float2 = OpFDiv %foffset $tmp; + result:$$vector = OpExtInst glsl450 InterpolateAtOffset $x %offsetdiv16 + }; + } + } } -__generic +__generic [__readNone] -[ForceInline] -[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] -matrix frexp(matrix x, out matrix exp) +[require(glsl_spirv, fragmentprocessing)] +matrix EvaluateAttributeSnapped(matrix x, int2 offset) { __target_switch { - case hlsl: __intrinsic_asm "frexp"; + case glsl: __intrinsic_asm "interpolateAtOffset($0, vec2($1) / 16.0f)"; default: - MATRIX_MAP_BINARY(T, N, M, frexp, x, exp); + matrix result; + for(int i = 0; i < N; ++i) + { + result[i] = EvaluateAttributeSnapped(x[i], offset); + } + return result; } } -/// Texture filter width. 
-/// @category derivative +/// Computes base-e exponent. +/// @category math __generic [__readNone] -[require(glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] -T fwidth(T x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T exp(T x) { - __requireComputeDerivative(); __target_switch { - case hlsl: - __intrinsic_asm "fwidth($0)"; - case glsl: - __intrinsic_asm "fwidth($0)"; - case metal: - __intrinsic_asm "fwidth($0)"; - case spirv: - return spirv_asm - { - OpFwidth $$T result $x; - }; - case wgsl: - __intrinsic_asm "fwidth($0)"; + case cpp: __intrinsic_asm "$P_exp($0)"; + case cuda: __intrinsic_asm "$P_exp($0)"; + case glsl: __intrinsic_asm "exp"; + case hlsl: __intrinsic_asm "exp"; + case metal: __intrinsic_asm "exp"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Exp $x + }; + case wgsl: __intrinsic_asm "exp"; } } +/// @category math __generic [__readNone] -[require(glsl_hlsl_spirv_wgsl, fragmentprocessing)] -vector fwidth(vector x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector exp(vector x) { - __requireComputeDerivative(); __target_switch { - case hlsl: - __intrinsic_asm "fwidth($0)"; - case glsl: - __intrinsic_asm "fwidth($0)"; - case spirv: - return spirv_asm - { - OpFwidth $$vector result $x; - }; - case wgsl: - __intrinsic_asm "fwidth($0)"; + case glsl: __intrinsic_asm "exp"; + case hlsl: __intrinsic_asm "exp"; + case metal: __intrinsic_asm "exp"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Exp $x + }; + case wgsl: __intrinsic_asm "exp"; + default: + VECTOR_MAP_UNARY(T, N, exp, x); } } +/// @category math __generic [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing)] -matrix fwidth(matrix x) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix exp(matrix x) { __target_switch { - case hlsl: - __intrinsic_asm "fwidth($0)"; + case hlsl: __intrinsic_asm "exp"; default: - MATRIX_MAP_UNARY(T, N, M, fwidth, x); + MATRIX_MAP_UNARY(T, N, M, exp, x); } } -__intrinsic_op($(kIROp_GetPerVertexInputArray)) -Array __GetPerVertexInputArray(T attribute); - -/// Get the value of a vertex attribute at a specific vertex. -/// -/// The `GetAttributeAtVertex()` function can be used in a fragment shader -/// to get the value of the given `attribute` at the vertex of the primitive -/// that corresponds to the given `vertexIndex`. -/// -/// Note that the `attribute` must have been a declared varying input to -/// the fragment shader with the `nointerpolation` modifier. -/// -/// This function can be applied to scalars, vectors, and matrices of -/// built-in scalar types. 
-/// -__generic +/// Computes base-2 exponent +/// @category math +__generic [__readNone] -__glsl_version(450) -__glsl_extension(GL_EXT_fragment_shader_barycentric) -[require(glsl_hlsl_spirv, getattributeatvertex)] -[KnownBuiltin("GetAttributeAtVertex")] -[__unsafeForceInlineEarly] -T GetAttributeAtVertex(T attribute, uint vertexIndex) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T exp2(T x) { __target_switch { - case hlsl: - __intrinsic_asm "GetAttributeAtVertex"; case glsl: + __intrinsic_asm "exp2($0)"; case spirv: - return __GetPerVertexInputArray(attribute)[vertexIndex]; + if (__isHalf()) + { + return spirv_asm { OpExtInst $$T result glsl450 Exp2 $x }; + } + else + { + float xf = __realCast(x); + return T(spirv_asm { + result:$$float = OpExtInst glsl450 Exp2 $xf + }); + } + case hlsl: + __intrinsic_asm "exp2($0)"; + case metal: __intrinsic_asm "exp2"; + case cpp: + __intrinsic_asm "$P_exp2($0)"; + case cuda: + __intrinsic_asm "$P_exp2($0)"; + case wgsl: + __intrinsic_asm "exp2"; } + } -/// Get the value of a vertex attribute at a specific vertex. -/// -/// The `GetAttributeAtVertex()` function can be used in a fragment shader -/// to get the value of the given `attribute` at the vertex of the primitive -/// that corresponds to the given `vertexIndex`. -/// -/// Note that the `attribute` must have been a declared varying input to -/// the fragment shader with the `nointerpolation` modifier. -/// -/// This function can be applied to scalars, vectors, and matrices of -/// built-in scalar types. -/// -__generic +__generic [__readNone] -__glsl_version(450) -__glsl_extension(GL_EXT_fragment_shader_barycentric) -[require(glsl_hlsl_spirv, getattributeatvertex)] -vector GetAttributeAtVertex(vector attribute, uint vertexIndex) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector exp2(vector x) { __target_switch { - case hlsl: - __intrinsic_asm "GetAttributeAtVertex"; - case glsl: - __intrinsic_asm "$0[$1]"; - case spirv: - return spirv_asm { - %_ptr_Input_vectorT = OpTypePointer Input $$vector; - %addr = OpAccessChain %_ptr_Input_vectorT $attribute $vertexIndex; - result:$$vector = OpLoad %addr; - }; + case glsl: + __intrinsic_asm "exp2($0)"; + case hlsl: __intrinsic_asm "exp2"; + case metal: __intrinsic_asm "exp2"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Exp2 $x + }; + case wgsl: __intrinsic_asm "exp2"; + default: + VECTOR_MAP_UNARY(T, N, exp2, x); } } -/// Get the value of a vertex attribute at a specific vertex. -/// -/// The `GetAttributeAtVertex()` function can be used in a fragment shader -/// to get the value of the given `attribute` at the vertex of the primitive -/// that corresponds to the given `vertexIndex`. -/// -/// Note that the `attribute` must have been a declared varying input to -/// the fragment shader with the `nointerpolation` modifier. -/// -/// This function can be applied to scalars, vectors, and matrices of -/// built-in scalar types. 
-/// -__generic +__generic [__readNone] -__glsl_version(450) -__glsl_extension(GL_EXT_fragment_shader_barycentric) -[require(glsl_hlsl_spirv, getattributeatvertex)] -matrix GetAttributeAtVertex(matrix attribute, uint vertexIndex) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix exp2(matrix x) { __target_switch { - case hlsl: - __intrinsic_asm "GetAttributeAtVertex"; - case glsl: - __intrinsic_asm "$0[$1]"; - case spirv: - return spirv_asm { - %_ptr_Input_matrixT = OpTypePointer Input $$matrix; - %addr = OpAccessChain %_ptr_Input_matrixT $attribute $vertexIndex; - result:$$matrix = OpLoad %addr; - }; + case hlsl: __intrinsic_asm "exp2"; + default: + MATRIX_MAP_UNARY(T, N, M, exp2, x); } } -// Get number of samples in render target +/// Computes base-10 exponent +/// @category math +__generic [__readNone] -[require(hlsl, sm_4_0)] -[require(metal)] -uint GetRenderTargetSampleCount() +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T exp10(T x) { __target_switch { - case hlsl: __intrinsic_asm "GetRenderTargetSampleCount"; - case metal: __intrinsic_asm "get_num_samples"; + case metal: __intrinsic_asm "exp10"; + default: + const T ln10 = T(2.302585092994045901); // ln(10) + return exp(x * ln10); } } -// Get position of given sample +__generic [__readNone] -[require(hlsl, sm_4_0)] -[require(metal)] -float2 GetRenderTargetSamplePosition(int Index) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector exp10(vector x) { __target_switch { - case hlsl: __intrinsic_asm "GetRenderTargetSamplePosition"; - case metal: __intrinsic_asm "get_sample_position"; + case metal: __intrinsic_asm "exp10"; + default: + const T ln10 = T(2.30258509299); // ln(10) + return exp(x * ln10); } } -/// Group memory barrier. Ensures that all memory accesses in the group are visible to all threads in the group. 
-/// @category barrier -__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] -void GroupMemoryBarrier() + +/// Convert 16-bit float stored in low bits of integer +/// @category conversion Conversion functions +__glsl_version(420) +__cuda_sm_version(6.0) +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +float f16tof32(uint value) { __target_switch { - case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)"; - case hlsl: __intrinsic_asm "GroupMemoryBarrier"; - case cuda: __intrinsic_asm "__threadfence_block"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; + case glsl: __intrinsic_asm "unpackHalf2x16($0).x"; + case hlsl: __intrinsic_asm "f16tof32($0)"; + case cuda: __intrinsic_asm "__half2float(__ushort_as_half($0))"; + case cpp: __intrinsic_asm "f16tof32($0)"; + case metal: __intrinsic_asm "as_type((ushort)($0))"; case spirv: - spirv_asm - { - OpMemoryBarrier Workgroup AcquireRelease|WorkgroupMemory + { + return spirv_asm { + %lowBits = OpUConvert $$uint16_t $value; + %half = OpBitcast $$half %lowBits; + result:$$float = OpFConvert %half }; - case wgsl: __intrinsic_asm "workgroupBarrier"; + } + case wgsl: __intrinsic_asm "unpack2x16float($0).x"; } } -[require(cuda_glsl_hlsl_metal_spirv, memorybarrier)] -void __subgroupBarrier() +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector f16tof32(vector value) { __target_switch { - case glsl: __intrinsic_asm "subgroupBarrier"; - case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync"; - case cuda: __intrinsic_asm "__syncthreads()"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; + case hlsl: __intrinsic_asm "f16tof32"; case spirv: - spirv_asm - { - OpControlBarrier Subgroup Subgroup AcquireRelease|WorkgroupMemory|ImageMemory|UniformMemory + { + return spirv_asm { + %lowBits = OpUConvert $$vector $value; + %half = OpBitcast $$vector %lowBits; + result:$$vector = OpFConvert %half }; } + default: + VECTOR_MAP_UNARY(float, N, f16tof32, value); + } } -/// Group memory barrier. Ensures that all memory accesses in the group are visible to all threads in the group. -/// @category barrier -__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] -void GroupMemoryBarrierWithGroupSync() +/// Convert to 16-bit float stored in low bits of integer. 
+/// @category conversion +__glsl_version(420) +__cuda_sm_version(6.0) +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint f32tof16(float value) { __target_switch { - case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)"; - case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync"; - case cuda: __intrinsic_asm "__syncthreads()"; - case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; + case glsl: __intrinsic_asm "packHalf2x16(vec2($0,0.0))"; + case hlsl: __intrinsic_asm "f32tof16($0)"; + case cuda: __intrinsic_asm "__half_as_ushort(__float2half($0))"; + case cpp: __intrinsic_asm "f32tof16($0)"; + case metal: __intrinsic_asm "as_type((half)($0))"; case spirv: - spirv_asm - { - OpControlBarrier Workgroup Workgroup AcquireRelease|WorkgroupMemory + { + return spirv_asm { + %half = OpFConvert $$half $value; + %lowBits = OpBitcast $$uint16_t %half; + result:$$uint = OpUConvert %lowBits }; - case wgsl: __intrinsic_asm "workgroupBarrier"; + } + case wgsl: __intrinsic_asm "pack2x16float(vec2f($0,0.0))"; } } -// Atomics - -__generic -__intrinsic_op($(kIROp_MetalAtomicCast)) -[require(metal)] -T* __getMetalAtomicRef(__ref T x); - -// Checks if input is a ImageSubscript -__generic -__intrinsic_op($(kIROp_IsTextureAccess)) -bool __isTextureAccess(__ref T x); - -// Checks if input is a texture of T type scalar -__generic -__intrinsic_op($(kIROp_IsTextureScalarAccess)) -bool __isTextureScalarAccess(__ref T x); - -// Checks if input is a texture array -__generic -__intrinsic_op($(kIROp_IsTextureArrayAccess)) -bool __isTextureArrayAccess(__ref T x); - -// Accepts an ImageSubscript -// Gets Texture used with ImageSubscript. -__generic -__intrinsic_op($(kIROp_ExtractTextureFromTextureAccess)) -TextureAccess* __extractTextureFromTextureAccess(__ref TextureAccess x); - -// Accepts an ImageSubscript -// Gets Coord from ImageSubscript. 
Swizzles out ArrayCoord if applicable -__generic -__intrinsic_op($(kIROp_ExtractCoordFromTextureAccess)) -uint __extractCoordFromTextureAccess(__ref TextureAccess x); - -// Accepts an ImageSubscript -// Gets ArrayCoord from ImageSubscript -__generic -__intrinsic_op($(kIROp_ExtractArrayCoordFromTextureAccess)) -uint __extractArrayCoordFromTextureAccess(__ref TextureAccess x); - -${{{{ -for (bool isArray : {false, true}) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector f32tof16(vector value) { - StringBuilder coordBuilder; - StringBuilder coordFetchBuilder; - - StringBuilder threeParamsASMBuilder; - StringBuilder threeParamsOutputParamASMBuilder; - - StringBuilder fourParamsASMBuilder; - - coordBuilder << "Coord coord"; - coordFetchBuilder << "coord"; - - threeParamsASMBuilder << "$1, $2"; - - fourParamsASMBuilder << "$1, $2, $3"; - if(isArray) - { - coordBuilder << ", uint arrayCoord"; - coordFetchBuilder << ", arrayCoord"; - threeParamsASMBuilder << ", $3"; - fourParamsASMBuilder << ", $4"; - threeParamsOutputParamASMBuilder << "$4"; - } - else - { - threeParamsOutputParamASMBuilder << "$3"; - } - auto coordString = coordBuilder.toString(); - auto coordFetchString = coordFetchBuilder.toString(); - - auto threeParamsASMString = threeParamsASMBuilder.toString(); - auto threeParamsOutputParamASMString = threeParamsOutputParamASMBuilder.toString(); - - auto fourParamsASMString = fourParamsASMBuilder.toString(); -}}}} - -${{{{ - for (const char* atomicOperation : {"add", "and", "max", "min", "or", "sub", "xor"}) - { -}}}} - __generic - [ForceInline] - [require(metal)] - vector __metalImageInterlocked_$(atomicOperation)(TextureType tex, $(coordString), vector value) - { - static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures"); - static_assert(Coord is uint || Coord is vector || Coord is vector || Coord is vector, - "__metalImageInterlocked implementation only allows 'uint' coordinates"); - __intrinsic_asm "$0.atomic_fetch_$(atomicOperation)($(threeParamsASMString))"; - } - - __generic - [ForceInline] - [require(metal)] - void __metalImageInterlocked_$(atomicOperation)(TextureType tex, $(coordString), vector value, out T original_value) - { - static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures"); - static_assert(Coord is uint || Coord is vector || Coord is vector || Coord is vector, - "__metalImageInterlocked implementation only allows 'uint' coordinates"); - original_value = __metalImageInterlocked_$(atomicOperation)(tex, $(coordFetchString), value)[0]; - } -${{{{ - } // atomicOperation -}}}} - - __generic - [ForceInline] - [require(metal)] - vector __metalImageInterlocked_exchange(TextureType tex, $(coordString), vector value) - { - static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures"); - static_assert(Coord is uint || Coord is vector || Coord is vector || Coord is vector, - "__metalImageInterlocked implementation only allows 'uint' coordinates"); - __intrinsic_asm "($0.atomic_exchange($(threeParamsASMString)))"; - } - __generic - [ForceInline] - [require(metal)] - void __metalImageInterlocked_exchange(TextureType tex, $(coordString), vector value, out T original_value) + __target_switch { - static_assert(T is int || T is uint, "Metal atomic texture operations only allow 'int'/'uint' textures"); - static_assert(Coord is uint || Coord is vector || Coord is vector || Coord is vector, - "__metalImageInterlocked 
implementation only allows 'uint' coordinates"); - original_value = __metalImageInterlocked_exchange(tex, $(coordFetchString), value)[0]; - } - - __generic - [ForceInline] - [require(metal)] - void __metalImageInterlocked_compare_exchange(TextureType tex, $(coordString), __ref vector compare_value, vector value) + case hlsl: __intrinsic_asm "f32tof16"; + case spirv: { - static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures"); - static_assert(Coord is uint || Coord is vector || Coord is vector || Coord is vector, - "__metalImageInterlocked implementation only allows 'uint' coordinates"); - __intrinsic_asm "($0.atomic_compare_exchange_weak($(fourParamsASMString)))"; + return spirv_asm { + %half = OpFConvert $$vector $value; + %lowBits = OpBitcast $$vector %half; + result:$$vector = OpUConvert %lowBits + }; } - __generic - [ForceInline] - [require(metal)] - void __metalImageInterlocked_compare_exchange(TextureType tex, $(coordString), vector compare_value, vector value, out T original_value) - { - static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures"); - static_assert(Coord is uint || Coord is vector || Coord is vector || Coord is vector, - "__metalImageInterlocked implementation only allows 'uint' coordinates"); - __metalImageInterlocked_compare_exchange(tex, $(coordFetchString), compare_value, value); - original_value = compare_value[0]; + default: + VECTOR_MAP_UNARY(uint, N, f32tof16, value); } +} -${{{{ -} // isArray -}}}} - -${{{{ - -// Generated functions: - -// atomicAdd, InterlockedAdd, atomic_fetch_add_explicit, OpAtomicIAdd, OpAtomicFAddEXT -// __cudaInterlocked_add, __glslInterlocked_add, __hlslInterlocked_add, __metalInterlocked_add, __spirvInterlocked_add - -// atomicAnd, InterlockedAnd, atomic_fetch_and_explicit, OpAtomicAnd -// __cudaInterlocked_and, __glslInterlocked_and, __hlslInterlocked_and, __metalInterlocked_and, __spirvInterlocked_and - -// atomicMax, InterlockedMax, atomic_fetch_max_explicit, OpAtomicUMax, OpAtomicSMax, OpAtomicFMaxEXT -// __cudaInterlocked_max, __glslInterlocked_max, __hlslInterlocked_max, __metalInterlocked_max, __spirvInterlocked_max - -// atomicMin, InterlockedMin, atomic_fetch_min_explicit, OpAtomicUMin, OpAtomicSMin, OpAtomicFMinEXT -// __cudaInterlocked_min, __glslInterlocked_min, __hlslInterlocked_min, __metalInterlocked_min, __spirvInterlocked_min - -// atomicOr, InterlockedOr, atomic_fetch_or_explicit, OpAtomicOr -// __cudaInterlocked_or, __glslInterlocked_or, __hlslInterlocked_or, __metalInterlocked_or, __spirvInterlocked_or - -// atomicXor, InterlockedXor, atomic_fetch_xor_explicit, OpAtomicXor -// __cudaInterlocked_xor, __glslInterlocked_xor, __hlslInterlocked_xor, __metalInterlocked_xor, __spirvInterlocked_xor - -// atomicExchange, atomicExch, InterlockedExchange, atomic_exchange_explicit, OpAtomicExchange -// __cudaInterlocked_exchange, __glslInterlocked_exchange, __hlslInterlocked_exchange, __metalInterlocked_exchange, __spirvInterlocked_exchange - -struct InternalAtomicOperationInfo -{ - const char* slangSuffix; - const char* cudaSuffix; - const char* glslSuffix; - const char* hlslSuffix; - const char* metalSuffix; - const char* spirvFloatSuffix; - const char* spirvUIntSuffix; - const char* spirvIntSuffix; - - const char* assertExpr; -}; - -InternalAtomicOperationInfo internalAtomicOperationInfo[7] = { - { "add", "Add", "Add", "Add", "fetch_add", "FAddEXT", "IAdd", "IAdd", "true" }, - { "and", "And", "And", "And", "fetch_and", "And", "And", "And", 
"!__isFloat()" }, - { "max", "Max", "Max", "Max", "fetch_max", "FMaxEXT", "UMax", "SMax", "true" }, - { "min", "Min", "Min", "Min", "fetch_min", "FMinEXT", "UMin", "SMin", "true" }, - { "or", "Or", "Or", "Or", "fetch_or", "Or", "Or", "Or", "!__isFloat()" }, - { "xor", "Xor", "Xor", "Xor", "fetch_xor", "Xor", "Xor", "Xor", "!__isFloat()" }, - { "exchange", "Exch", "Exchange", "Exchange", "exchange", "Exchange", "Exchange", "Exchange", "true" }, -}; +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// The following is Slang specific and NOT part of standard HLSL +// It's not clear what happens with float16 time in HLSL -> can the float16 coerce to uint for example? If so that would +// give the wrong result -for (InternalAtomicOperationInfo atomicOp : internalAtomicOperationInfo) +__glsl_version(420) +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +float f16tof32(float16_t value) { -}}}} - __generic - [ForceInline] - [require(metal)] - void __metalInterlocked_$(atomicOp.slangSuffix)(AtomicType dest, T value) + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "atomic_$(atomicOp.metalSuffix)_explicit($0, $1, memory_order_relaxed)"; - } - - __generic - [ForceInline] - [require(metal)] - void __metalInterlocked_$(atomicOp.slangSuffix)(AtomicType dest, T value, out T original_value) + case glsl: __intrinsic_asm "unpackHalf2x16($0).x"; + case hlsl: __intrinsic_asm "f16tof32($0)"; + case cuda: __intrinsic_asm "__half2float($0)"; + case cpp: __intrinsic_asm "f16tof32($0)"; + case metal: __intrinsic_asm "float($0)"; + case spirv: { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "((*($2)) = (atomic_$(atomicOp.metalSuffix)_explicit($0, $1, memory_order_relaxed)))"; + return spirv_asm { + result:$$float = OpFConvert $value + }; + } + case wgsl: __intrinsic_asm "f32($0)"; } +} - __generic - [ForceInline] - [require(cuda)] - void __cudaInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector f16tof32(vector value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "atomic$(atomicOp.cudaSuffix)((int*)$0, $1)"; + case cuda: __intrinsic_asm "__half2float"; + case hlsl: __intrinsic_asm "f16tof32"; + case metal: __intrinsic_asm "$TR($0)"; + case spirv: return spirv_asm { + OpFConvert $$vector result $value + }; + default: + VECTOR_MAP_UNARY(float, N, f16tof32, value); } +} - __generic - [ForceInline] - [require(cuda)] - void __cudaInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value) +/// Convert to float16_t. 
+/// @category conversion +__glsl_version(420) +[__readNone] +[require(cuda_glsl_metal_spirv_wgsl, shader5_sm_5_0)] +float16_t f32tof16_(float value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "(*$2 = atomic$(atomicOp.cudaSuffix)((int*)$0, $1))"; + case cuda: __intrinsic_asm "__float2half"; + case glsl: __intrinsic_asm "packHalf2x16(vec2($0,0.0))"; + case metal: __intrinsic_asm "half($0)"; + case spirv: return spirv_asm { + OpFConvert $$float16_t result $value + }; + case wgsl: __intrinsic_asm "f16($0)"; } +} - __generic - [ForceInline] - [require(glsl)] - void __glslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value) +__generic +[__readNone] +[require(cuda_glsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector f32tof16_(vector value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "$atomic$(atomicOp.glslSuffix)($A, $1)"; + case cuda: __intrinsic_asm "__float2half"; + case metal: __intrinsic_asm "$TR($0)"; + case spirv: return spirv_asm { + OpFConvert $$vector result $value + }; + default: + VECTOR_MAP_UNARY(float16_t, N, f32tof16_, value); } +} - __generic - [ForceInline] - [require(glsl)] - void __glslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value) +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +/// Flip surface normal to face forward, if needed. +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector faceforward(vector n, vector i, vector ng) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "($2 = $atomic$(atomicOp.glslSuffix)($A, $1))"; + case glsl: __intrinsic_asm "faceforward"; + case hlsl: __intrinsic_asm "faceforward"; + case metal: __intrinsic_asm "faceforward"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FaceForward $n $i $ng + }; + case wgsl: __intrinsic_asm "faceForward"; + default: + return dot(ng, i) < T(0.0f) ? n : -n; } +} - __generic - [ForceInline] - [require(hlsl)] - void __hlslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value) +/// Find first set bit starting at high bit and working down. 
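// Illustrative use of faceforward: the returned normal always opposes the
// incident direction i, matching the dot(ng, i) < 0 ? n : -n fallback above.
// Function and parameter names are hypothetical.
float3 shadingNormal(float3 n, float3 i, float3 ng)
{
    // Returns n when dot(ng, i) < 0, otherwise -n.
    return faceforward(n, i, ng);
}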
+/// @category bitops Bit operation functions +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int firstbithigh(int value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "Interlocked$(atomicOp.hlslSuffix)"; + case cpp: __intrinsic_asm "$P_firstbithigh($0)"; + case cuda: __intrinsic_asm "$P_firstbithigh($0)"; + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case metal: __intrinsic_asm "clz"; + case spirv: return spirv_asm { + OpExtInst $$int result glsl450 FindSMsb $value + }; + case wgsl: __intrinsic_asm "firstLeadingBit"; } +} - __generic - [ForceInline] - [require(hlsl)] - void __hlslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector firstbithigh(vector value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - __intrinsic_asm "Interlocked$(atomicOp.hlslSuffix)"; + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case metal: __intrinsic_asm "clz"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindSMsb $value + }; + case wgsl: __intrinsic_asm "firstLeadingBit"; + default: + VECTOR_MAP_UNARY(int, N, firstbithigh, value); } +} - __generic - [ForceInline] - [require(spirv)] - void __spirvInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value) +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint firstbithigh(uint value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - if (__isFloat()) - { - spirv_asm - { - result:$$T = OpAtomic$(atomicOp.spirvFloatSuffix) &dest Device None $value - }; - } - else if (__isUnsignedInt()) - { - spirv_asm - { - result:$$T = OpAtomic$(atomicOp.spirvUIntSuffix) &dest Device None $value - }; - } - else if (__isInt()) - { - spirv_asm - { - result:$$T = OpAtomic$(atomicOp.spirvIntSuffix) &dest Device None $value - }; - } + case cpp: __intrinsic_asm "$P_firstbithigh($0)"; + case cuda: __intrinsic_asm "$P_firstbithigh($0)"; + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case metal: __intrinsic_asm "clz"; + case spirv: return spirv_asm { + OpExtInst $$uint result glsl450 FindUMsb $value + }; + case wgsl: __intrinsic_asm "firstLeadingBit"; } +} - __generic - [ForceInline] - [require(spirv)] - void __spirvInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector firstbithigh(vector value) +{ + __target_switch { - static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); - if (__isFloat()) - { - spirv_asm - { - %original:$$T = OpAtomic$(atomicOp.spirvFloatSuffix) &dest Device None $value; - OpStore &original_value %original - }; - } - else if (__isUnsignedInt()) - { - spirv_asm - { - %original:$$T = OpAtomic$(atomicOp.spirvUIntSuffix) &dest Device None $value; - OpStore &original_value %original - }; - } - else if (__isInt()) - { - spirv_asm - { - %original:$$T = OpAtomic$(atomicOp.spirvIntSuffix) &dest Device None $value; - OpStore &original_value %original - }; - } + case glsl: __intrinsic_asm "findMSB"; + case hlsl: __intrinsic_asm "firstbithigh"; + case 
metal: __intrinsic_asm "clz"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindUMsb $value + }; + case wgsl: __intrinsic_asm "firstLeadingBit"; + default: + VECTOR_MAP_UNARY(uint, N, firstbithigh, value); } +} -${{{{ -} // fetchAndModify -}}}} - -__generic -[ForceInline] -[require(metal)] -void __metalInterlocked_compare_exchange(AtomicType dest, __ref T compare_value, T value) +/// Find first set bit starting at low bit and working up. +/// @category bitops +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int firstbitlow(int value) { - __intrinsic_asm "atomic_compare_exchange_weak_explicit($0, $1, $2, memory_order_relaxed, memory_order_relaxed)"; + __target_switch + { + case cpp: __intrinsic_asm "$P_firstbitlow($0)"; + case cuda: __intrinsic_asm "$P_firstbitlow($0)"; + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case metal: __intrinsic_asm "ctz"; + case spirv: return spirv_asm { + OpExtInst $$int result glsl450 FindILsb $value + }; + case wgsl: __intrinsic_asm "firstTrailingBit"; + } } -__generic -[ForceInline] -[require(metal)] -void __metalInterlocked_compare_exchange(AtomicType dest, T compare_value, T value, out T original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector firstbitlow(vector value) { - __metalInterlocked_compare_exchange(dest, compare_value, value); - original_value = compare_value; + __target_switch + { + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case metal: __intrinsic_asm "ctz"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindILsb $value + }; + case wgsl: __intrinsic_asm "firstTrailingBit"; + default: + VECTOR_MAP_UNARY(int, N, firstbitlow, value); + } } -__generic -__glsl_version(430) -[ForceInline] -[require(cuda)] -void __cudaInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value) +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint firstbitlow(uint value) { - __intrinsic_asm "atomicCAS($0, $1, $2)"; + __target_switch + { + case cpp: __intrinsic_asm "$P_firstbitlow($0)"; + case cuda: __intrinsic_asm "$P_firstbitlow($0)"; + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case metal: __intrinsic_asm "ctz"; + case spirv: return spirv_asm { + OpExtInst $$uint result glsl450 FindILsb $value + }; + case wgsl: __intrinsic_asm "firstTrailingBit"; + } } -__generic -[ForceInline] -[require(cuda)] -void __cudaInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector firstbitlow(vector value) { - __intrinsic_asm "*$3 = atomicCAS($0, $1, $2)"; + __target_switch + { + case glsl: __intrinsic_asm "findLSB"; + case hlsl: __intrinsic_asm "firstbitlow"; + case metal: __intrinsic_asm "ctz"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 FindILsb $value + }; + case wgsl: __intrinsic_asm "firstTrailingBit"; + default: + VECTOR_MAP_UNARY(uint, N, firstbitlow, value); + } } -__generic -[ForceInline] -[require(glsl)] -void __glslInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value) +/// Floor (HLSL SM 1.0). 
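// Sketch of the bit-scan intrinsics above: walking the set bits of a mask from
// the lowest upward with firstbitlow (firstbithigh works the same way from the
// top). The helper name is illustrative only.
void visitSetBits(uint mask)
{
    while (mask != 0)
    {
        uint bit = firstbitlow(mask);   // index of the lowest set bit
        // ... use `bit` ...
        mask &= mask - 1;               // clear that bit and continue
    }
}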
+/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T floor(T x) { - __intrinsic_asm "$atomicCompSwap($A, $1, $2)"; + __target_switch + { + case cpp: __intrinsic_asm "$P_floor($0)"; + case cuda: __intrinsic_asm "$P_floor($0)"; + case glsl: __intrinsic_asm "floor"; + case hlsl: __intrinsic_asm "floor"; + case metal: __intrinsic_asm "floor"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Floor $x + }; + case wgsl: __intrinsic_asm "floor"; + } } -__generic -[ForceInline] -[require(glsl)] -void __glslInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector floor(vector x) { - __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; + __target_switch + { + case glsl: __intrinsic_asm "floor"; + case hlsl: __intrinsic_asm "floor"; + case metal: __intrinsic_asm "floor"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Floor $x + }; + case wgsl: __intrinsic_asm "floor"; + default: + VECTOR_MAP_UNARY(T, N, floor, x); + } } -__generic -[ForceInline] -[require(hlsl)] -void __hlslInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix floor(matrix x) { - __intrinsic_asm "InterlockedCompareExchange"; + __target_switch + { + case hlsl: __intrinsic_asm "floor"; + default: + MATRIX_MAP_UNARY(T, N, M, floor, x); + } } -__generic -[ForceInline] -[require(hlsl)] -void __hlslInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value) +/// Fused multiply-add. +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +T fma(T a, T b, T c) { - __intrinsic_asm "InterlockedCompareExchange"; + __target_switch + { + case cpp: __intrinsic_asm "$P_fma($0, $1, $2)"; + case cuda: __intrinsic_asm "$P_fma($0, $1, $2)"; + case glsl: __intrinsic_asm "fma"; + case hlsl: + if (__isFloat() || __isHalf()) + return mad(a, b, c); + else + __intrinsic_asm "fma($0, $1, $2)"; + case metal: __intrinsic_asm "fma"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Fma $a $b $c + }; + case wgsl: __intrinsic_asm "fma"; + default: + return a*b + c; + } } -__generic -[ForceInline] -[require(spirv)] -void __spirvInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +vector fma(vector a, vector b, vector c) { - spirv_asm + __target_switch { - %result:$$T = OpAtomicCompareExchange &dest Device None None $value $compare_value; + case glsl: __intrinsic_asm "fma"; + case hlsl: __intrinsic_asm "fma"; + case metal: __intrinsic_asm "fma"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Fma $a $b $c }; + case wgsl: __intrinsic_asm "fma"; + default: + VECTOR_MAP_TRINARY(T, N, fma, a, b, c); + } } -__generic -[ForceInline] -[require(spirv)] -void __spirvInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +matrix fma(matrix a, matrix b, matrix c) { - spirv_asm + __target_switch { - %original:$$T = OpAtomicCompareExchange &dest Device None None $value $compare_value; - OpStore &original_value %original - }; + case hlsl: __intrinsic_asm "fma"; + default: + 
MATRIX_MAP_TRINARY(T, N, M, fma, a, b, c); + } } -__generic +/// Floating point remainder of x/y. +/// The floating-point remainder is calculated such that x = i * y + f, +/// where i is an integer, f has the same sign as x, and the absolute value +/// of f is less than the absolute value of y. +/// @category math +__generic +[__readNone] [ForceInline] -[require(hlsl)] -void __hlslInterlocked_compare_exchange_float_bitwise(__ref T dest, T compare_value, T value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T fmod(T x, T y) { - __intrinsic_asm "InterlockedCompareExchangeFloatBitwise"; + // In HLSL, `fmod` returns a remainder. + // Definition of `fmod` in HLSL is, + // "The floating-point remainder is calculated such that x = i * y + f, + // where i is an integer, f has the same sign as x, and the absolute value + // of f is less than the absolute value of y." + // + // In GLSL, `mod` is a Modulus function. + // OpenGL document defines "Modulus" as "Returns x - y * floor(x / y)". + // The use of "Floor()" makes the difference. + // + // In Metal, `fmod` is Modulus function. + // Metal document defines it as "Returns x - y * trunc(x/y)". + // Note that the function name is same to HLSL but it behaves differently. + // + // The tricky ones are when x or y is a negative value. + // + // | Remainder | Modulus + // x y | x= i*y +f | x-y*floor(x/y) + // ------+-----------+------------------------------ + // 4 3 | 4= 1*3 +1 | 4-3*floor( 4/3) = 4-3* 1 = 1 + // 3 3 | 3= 1*3 +0 | 3-3*floor( 3/3) = 3-3* 1 = 0 + // 2 3 | 2= 0*3 +2 | 2-3*floor( 2/3) = 2-3* 0 = 2 + // 1 3 | 1= 0*3 +1 | 1-3*floor( 1/3) = 1-3* 0 = 1 + // 0 3 | 0= 0*3 +0 | 0-3*floor( 0/3) = 0-3* 0 = 0 + // -1 3 |-1= 0*3 -1 |-1-3*floor(-1/3) =-1-3*-1 = 2 + // -2 3 |-2= 0*3 -2 |-2-3*floor(-2/3) =-2-3*-1 = 1 + // -3 3 |-3=-1*3 0 |-3-3*floor(-3/3) =-3-3*-1 = 0 + // -4 3 |-4=-1*3 -1 |-4-3*floor(-4/3) =-4-3*-2 = 2 + // + // When y is a negative value, + // + // | Remainder | Modulus + // x y | x= i*y +f | x-y*floor(x/y) + // ------+-----------+------------------------------ + // 4 -3 | 4=-1*-3+1 | 4+3*floor( 4/-3) = 4+3*-2 =-2 + // 3 -3 | 3=-1*-3+0 | 3+3*floor( 3/-3) = 3+3*-1 = 0 + // 2 -3 | 2= 0*-3+2 | 2+3*floor( 2/-3) = 2+3*-1 =-1 + // 1 -3 | 1= 0*-3+1 | 1+3*floor( 1/-3) = 1+3*-1 =-2 + // 0 -3 | 0= 0*-3+0 | 0+3*floor( 0/-3) = 0+3* 0 = 0 + // -1 -3 |-1= 0*-3-1 |-1+3*floor(-1/-3) =-1+3* 0 =-1 + // -2 -3 |-2= 0*-3-2 |-2+3*floor(-2/-3) =-2+3* 0 =-2 + // -3 -3 |-3= 1*-3 0 |-3+3*floor(-3/-3) =-3+3* 1 = 0 + // -4 -3 |-4= 1*-3-1 |-4+3*floor(-4/-3) =-4+3* 1 =-1 + + __target_switch + { + case cpp: __intrinsic_asm "$P_fmod($0, $1)"; + case cuda: __intrinsic_asm "$P_fmod($0, $1)"; + case glsl: + // GLSL doesn't have a function for remainder. + __intrinsic_asm "(($0 < 0.0) ? -mod(-$0,abs($1)) : mod($0,abs($1)))"; + case hlsl: __intrinsic_asm "fmod"; + case metal: + // Metal doesn't have a function for remainder. + __intrinsic_asm "(($0 < 0.0) ? -fmod(-$0,abs($1)) : fmod($0,abs($1)))"; + case spirv: + // OpFRem return "The floating-point remainder whose sign + // matches the sign of Operand 1", where Operand 1 is "x". 
+ return spirv_asm + { + result:$$T = OpFRem $x $y + }; + case wgsl: + __intrinsic_asm "(($0) % ($1))"; + } } -__generic +__generic +[__readNone] [ForceInline] -[require(hlsl)] -void __hlslInterlocked_compare_exchange_float_bitwise(__ref T dest, T compare_value, T value, out T original_value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector fmod(vector x, vector y) { - __intrinsic_asm "InterlockedCompareExchangeFloatBitwise"; + __target_switch + { + case hlsl: __intrinsic_asm "fmod"; + case spirv: return spirv_asm { + result:$$vector = OpFRem $x $y + }; + default: + VECTOR_MAP_BINARY(T, N, fmod, x, y); + } } -${{{{ -// Generates code for: -// InterlockedAdd, InterlockedAnd, InterlockedOr, InterlockedXor, -// InterlockedMax, InterlockedMin, InterlockedExchange -struct SlangAtomicOperationInfo -{ - const char* slangCallSuffix; - const char* internalCallSuffix; -}; - -SlangAtomicOperationInfo slangAtomicOperationInfo[7] = { - { "Add", "add" }, - { "And", "and" }, - { "Or", "or" }, - { "Xor", "xor" }, - { "Max", "max" }, - { "Min", "min" }, - { "Exchange", "exchange" }, -}; - -for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo) +__generic +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix fmod(matrix x, matrix y) { - for(const char* T : {"int", "uint"}) + __target_switch { -}}}} + case hlsl: __intrinsic_asm "fmod"; + default: + MATRIX_MAP_BINARY(T, N, M, fmod, x, y); + } +} -[ForceInline] -__glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] -void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest, $(T) value) +/// Extract the fractional part of a floating-point number. +/// @category math +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T frac(T x) { - static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); __target_switch { - case hlsl: __hlslInterlocked_$(atomicOp.internalCallSuffix)(dest, value); - case cuda: __cudaInterlocked_$(atomicOp.internalCallSuffix)(dest, value); - case glsl: __glslInterlocked_$(atomicOp.internalCallSuffix)(dest, value); - case spirv: __spirvInterlocked_$(atomicOp.internalCallSuffix)(dest, value); - case metal: - if (__isTextureAccess(dest)) - { - if(__isTextureArrayAccess(dest)) - { - __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vector<$(T), 4>(value)); - } - else - { - __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), vector<$(T), 4>(value)); - } - } - else - { - __metalInterlocked_$(atomicOp.internalCallSuffix)(__getMetalAtomicRef(dest), value); - } - return; + case cpp: __intrinsic_asm "$P_frac($0)"; + case cuda: __intrinsic_asm "$P_frac($0)"; + case glsl: __intrinsic_asm "fract"; + case hlsl: __intrinsic_asm "frac"; + case metal: __intrinsic_asm "fract"; + case spirv: return spirv_asm { + OpExtInst $$T result glsl450 Fract $x + }; + case wgsl: __intrinsic_asm "fract"; } } -[ForceInline] -__glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] -void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest, $(T) value, out $(T) original_value) +__generic +[__readNone] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector frac(vector x) { - 
static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to a scalar texture or non-texture"); __target_switch { - case hlsl: __hlslInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value); - case cuda: __cudaInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value); - case glsl: __glslInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value); - case spirv: __spirvInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value); - case metal: - if (__isTextureAccess(dest)) - if(__isTextureArrayAccess(dest)) - { - __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vector<$(T),4>(value), original_value); - } - else - { - __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), vector<$(T),4>(value), original_value); - } - else - __metalInterlocked_$(atomicOp.internalCallSuffix)(__getMetalAtomicRef(dest), value, original_value); - return; + case glsl: __intrinsic_asm "fract"; + case hlsl: __intrinsic_asm "frac"; + case metal: __intrinsic_asm "fract"; + case spirv: return spirv_asm { + OpExtInst $$vector result glsl450 Fract $x + }; + case wgsl: __intrinsic_asm "fract"; + default: + VECTOR_MAP_UNARY(T, N, frac, x); } } -${{{{ - } // for(const char* T : {"int64_t", "uint64_t"}) -}}}} +__generic +[__readNone] +matrix frac(matrix x) +{ + MATRIX_MAP_UNARY(T, N, M, frac, x); +} +/// Extract the fractional part of a floating-point number. +/// @category math +__generic +[__readNone] [ForceInline] -void Interlocked$(atomicOp.slangCallSuffix)(__ref uint dest, int value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T fract(T x) { - Interlocked$(atomicOp.slangCallSuffix)(dest, (uint)value); + return frac(x); } -${{{{ -} // for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo) -}}}} - -${{{{ -for(const char* T : {"int64_t", "uint64_t"}) +__generic +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector fract(vector x) { -}}}} -/// @category atomic Atomic functions + return frac(x); +} + +/// Split float into mantissa and exponent. 
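// Making the remainder-vs-modulus table above concrete: fmod follows the HLSL
// remainder definition, so the result takes the sign of x. Values below are
// worked examples, not library code.
void fmodExamples()
{
    float a = fmod(-1.0, 3.0);   // -1.0 (a floor-based modulus would give 2.0)
    float b = fmod(-4.0, 3.0);   // -1.0 (a floor-based modulus would give 2.0)
    float c = fmod( 4.0, -3.0);  //  1.0 (a floor-based modulus would give -2.0)
}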
+/// @category math +__generic +[__readNone] [ForceInline] -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda_metal)] -void InterlockedAdd(__ref $(T) dest, $(T) value) -{ +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +T frexp(T x, out int exp) +{ __target_switch { - case hlsl: __hlslInterlocked_add(dest, value); - case cuda: __cudaInterlocked_add(dest, value); - case glsl: - __requireGLSLExtension("GL_EXT_shader_atomic_int64"); - __glslInterlocked_add(dest, value); - case spirv: - spirv_asm - { - OpCapability Int64Atomics; - result:$$$(T) = OpAtomicIAdd &dest Device None $value; - }; + case cpp: __intrinsic_asm "$P_frexp($0, $1)"; + case cuda: __intrinsic_asm "$P_frexp($0, $1)"; + case glsl: __intrinsic_asm "frexp"; + case hlsl: __intrinsic_asm "frexp"; + case metal: __intrinsic_asm "frexp($0, *($1))"; + case spirv: return spirv_asm { + result:$$T = OpExtInst glsl450 Frexp $x &exp + }; + case wgsl: + T fract; + __wgsl_frexp(x, fract, exp); + return fract; } } +__generic +[__readNone] [ForceInline] -void InterlockedAdd(__ref $(T) dest, $(T) value, out $(T) original_value) +[require(wgsl)] +void __wgsl_frexp(T x, out T fract, out int exp) { - __target_switch - { - case hlsl: __hlslInterlocked_add(dest, value, original_value); - case cuda: __cudaInterlocked_add(dest, value, original_value); - case glsl: - __requireGLSLExtension("GL_EXT_shader_atomic_int64"); - __glslInterlocked_add(dest, value, original_value); - case spirv: - spirv_asm - { - OpCapability Int64Atomics; - %origin:$$$(T) = OpAtomicIAdd &dest Device None $value; - OpStore &original_value %origin - }; - } + __intrinsic_asm "{ var s = frexp($0); ($1) = s.fract; ($2) = s.exp; }"; } -/// @category atomic +__generic +[__readNone] [ForceInline] -void InterlockedAnd(__ref $(T) dest, $(T) value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +vector frexp(vector x, out vector exp) { __target_switch { - case hlsl: __hlslInterlocked_and(dest, value); + case glsl: __intrinsic_asm "frexp"; + case hlsl: __intrinsic_asm "frexp"; + case metal: __intrinsic_asm "frexp($0, *($1))"; + case spirv: return spirv_asm { + result:$$vector = OpExtInst glsl450 Frexp $x &exp + }; + case wgsl: + vector fract; + __wgsl_frexp(x, fract, exp); + return fract; + default: + VECTOR_MAP_BINARY(T, N, frexp, x, exp); } } +__generic +[__readNone] +[ForceInline] +[require(wgsl)] +void __wgsl_frexp(vector x, out vector fract, out vector exp) +{ + __intrinsic_asm "{ var s = frexp($0); ($1) = s.fract; ($2) = s.exp; }"; +} + +__generic +[__readNone] [ForceInline] -void InterlockedAnd(__ref $(T) dest, $(T) value, out $(T) original_value) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, sm_4_0_version)] +matrix frexp(matrix x, out matrix exp) { __target_switch { - case hlsl: __hlslInterlocked_and(dest, value, original_value); + case hlsl: __intrinsic_asm "frexp"; + default: + MATRIX_MAP_BINARY(T, N, M, frexp, x, exp); } } -/// @category atomic -[ForceInline] -void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value) +/// Texture filter width. 
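// Sketch of frexp: for positive finite x it yields a mantissa in [0.5, 1) and
// an exponent such that x == mantissa * 2^exp; ldexp (as in HLSL) reassembles
// the value. The function name is hypothetical.
void frexpExample()
{
    int e;
    float m = frexp(48.0, e);   // m == 0.75, e == 6
    float back = ldexp(m, e);   // reconstructs 48.0
}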
+/// @category derivative +__generic +[__readNone] +[require(glsl_hlsl_metal_spirv_wgsl, fragmentprocessing)] +T fwidth(T x) { + __requireComputeDerivative(); __target_switch { - case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value); + case hlsl: + __intrinsic_asm "fwidth($0)"; + case glsl: + __intrinsic_asm "fwidth($0)"; + case metal: + __intrinsic_asm "fwidth($0)"; + case spirv: + return spirv_asm + { + OpFwidth $$T result $x; + }; + case wgsl: + __intrinsic_asm "fwidth($0)"; } } -[ForceInline] -void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value, out $(T) original_value) +__generic +[__readNone] +[require(glsl_hlsl_spirv_wgsl, fragmentprocessing)] +vector fwidth(vector x) { + __requireComputeDerivative(); __target_switch { - case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value, original_value); + case hlsl: + __intrinsic_asm "fwidth($0)"; + case glsl: + __intrinsic_asm "fwidth($0)"; + case spirv: + return spirv_asm + { + OpFwidth $$vector result $x; + }; + case wgsl: + __intrinsic_asm "fwidth($0)"; } } -[ForceInline] -void InterlockedCompareStore(__ref $(T) dest, $(T) compare_value, $(T) value); +__generic +[__readNone] +[require(glsl_hlsl_spirv, fragmentprocessing)] +matrix fwidth(matrix x) { __target_switch { - case hlsl: __intrinsic_asm "InterlockedCompareStore"; + case hlsl: + __intrinsic_asm "fwidth($0)"; + default: + MATRIX_MAP_UNARY(T, N, M, fwidth, x); } } -/// @category atomic -[ForceInline] -void InterlockedExchange(__ref $(T) dest, $(T) value) +__intrinsic_op($(kIROp_GetPerVertexInputArray)) +Array __GetPerVertexInputArray(T attribute); + +/// Get the value of a vertex attribute at a specific vertex. +/// +/// The `GetAttributeAtVertex()` function can be used in a fragment shader +/// to get the value of the given `attribute` at the vertex of the primitive +/// that corresponds to the given `vertexIndex`. +/// +/// Note that the `attribute` must have been a declared varying input to +/// the fragment shader with the `nointerpolation` modifier. +/// +/// This function can be applied to scalars, vectors, and matrices of +/// built-in scalar types. +/// +__generic +[__readNone] +__glsl_version(450) +__glsl_extension(GL_EXT_fragment_shader_barycentric) +[require(glsl_hlsl_spirv, getattributeatvertex)] +[KnownBuiltin("GetAttributeAtVertex")] +[__unsafeForceInlineEarly] +T GetAttributeAtVertex(T attribute, uint vertexIndex) { __target_switch { - case hlsl: __intrinsic_asm "InterlockedExchange"; + case hlsl: + __intrinsic_asm "GetAttributeAtVertex"; + case glsl: + case spirv: + return __GetPerVertexInputArray(attribute)[vertexIndex]; } } -[ForceInline] -void InterlockedExchange(__ref $(T) dest, $(T) value, out $(T) original_value) +/// Get the value of a vertex attribute at a specific vertex. +/// +/// The `GetAttributeAtVertex()` function can be used in a fragment shader +/// to get the value of the given `attribute` at the vertex of the primitive +/// that corresponds to the given `vertexIndex`. +/// +/// Note that the `attribute` must have been a declared varying input to +/// the fragment shader with the `nointerpolation` modifier. +/// +/// This function can be applied to scalars, vectors, and matrices of +/// built-in scalar types. 
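// Sketch of fwidth for screen-space antialiasing of a procedural edge; as the
// requirement above notes, derivatives need fragment (or compute-derivative)
// execution. The helper name is hypothetical.
float aaStep(float edge, float x)
{
    // fwidth(x) = abs(ddx(x)) + abs(ddy(x)) approximates x's footprint in this pixel.
    float w = fwidth(x);
    return smoothstep(edge - w, edge + w, x);
}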
+/// +__generic +[__readNone] +__glsl_version(450) +__glsl_extension(GL_EXT_fragment_shader_barycentric) +[require(glsl_hlsl_spirv, getattributeatvertex)] +vector GetAttributeAtVertex(vector attribute, uint vertexIndex) { __target_switch { - case hlsl: __intrinsic_asm "InterlockedExchange"; + case hlsl: + __intrinsic_asm "GetAttributeAtVertex"; + case glsl: + __intrinsic_asm "$0[$1]"; + case spirv: + return spirv_asm { + %_ptr_Input_vectorT = OpTypePointer Input $$vector; + %addr = OpAccessChain %_ptr_Input_vectorT $attribute $vertexIndex; + result:$$vector = OpLoad %addr; + }; } } -/// @category atomic -[ForceInline] -void InterlockedMax(__ref $(T) dest, $(T) value) +/// Get the value of a vertex attribute at a specific vertex. +/// +/// The `GetAttributeAtVertex()` function can be used in a fragment shader +/// to get the value of the given `attribute` at the vertex of the primitive +/// that corresponds to the given `vertexIndex`. +/// +/// Note that the `attribute` must have been a declared varying input to +/// the fragment shader with the `nointerpolation` modifier. +/// +/// This function can be applied to scalars, vectors, and matrices of +/// built-in scalar types. +/// +__generic +[__readNone] +__glsl_version(450) +__glsl_extension(GL_EXT_fragment_shader_barycentric) +[require(glsl_hlsl_spirv, getattributeatvertex)] +matrix GetAttributeAtVertex(matrix attribute, uint vertexIndex) { __target_switch { - case hlsl: __intrinsic_asm "InterlockedMax"; + case hlsl: + __intrinsic_asm "GetAttributeAtVertex"; + case glsl: + __intrinsic_asm "$0[$1]"; + case spirv: + return spirv_asm { + %_ptr_Input_matrixT = OpTypePointer Input $$matrix; + %addr = OpAccessChain %_ptr_Input_matrixT $attribute $vertexIndex; + result:$$matrix = OpLoad %addr; + }; } } -[ForceInline] -void InterlockedMax(__ref $(T) dest, $(T) value, out $(T) original_value) +// Get number of samples in render target +[__readNone] +[require(hlsl, sm_4_0)] +[require(metal)] +uint GetRenderTargetSampleCount() { __target_switch { - case hlsl: __intrinsic_asm "InterlockedMax"; + case hlsl: __intrinsic_asm "GetRenderTargetSampleCount"; + case metal: __intrinsic_asm "get_num_samples"; } } -/// @category atomic -[ForceInline] -void InterlockedMin(__ref $(T) dest, $(T) value) +// Get position of given sample +[__readNone] +[require(hlsl, sm_4_0)] +[require(metal)] +float2 GetRenderTargetSamplePosition(int Index) { __target_switch { - case hlsl: __intrinsic_asm "InterlockedMin"; + case hlsl: __intrinsic_asm "GetRenderTargetSamplePosition"; + case metal: __intrinsic_asm "get_sample_position"; } } -[ForceInline] -void InterlockedMin(__ref $(T) dest, $(T) value, out $(T) original_value) +/// Group memory barrier. Ensures that all memory accesses in the group are visible to all threads in the group. 
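// Sketch of GetAttributeAtVertex in a fragment shader: the varying must be
// declared with the nointerpolation modifier, after which the per-vertex values
// of the current primitive can be read explicitly. Names are hypothetical.
[shader("fragment")]
float4 flatColorMain(nointerpolation float3 vertexColor : COLOR0) : SV_Target
{
    float3 c0 = GetAttributeAtVertex(vertexColor, 0);
    float3 c1 = GetAttributeAtVertex(vertexColor, 1);
    float3 c2 = GetAttributeAtVertex(vertexColor, 2);
    return float4((c0 + c1 + c2) / 3.0, 1.0);
}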
+/// @category barrier +__glsl_extension(GL_KHR_memory_scope_semantics) +[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] +void GroupMemoryBarrier() { __target_switch { - case hlsl: __intrinsic_asm "InterlockedMin"; + case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)"; + case hlsl: __intrinsic_asm "GroupMemoryBarrier"; + case cuda: __intrinsic_asm "__threadfence_block"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; + case spirv: + spirv_asm + { + OpMemoryBarrier Workgroup AcquireRelease|WorkgroupMemory + }; + case wgsl: __intrinsic_asm "workgroupBarrier"; } } -/// @category atomic -[ForceInline] -void InterlockedOr(__ref $(T) dest, $(T) value) +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier)] +void __subgroupBarrier() { __target_switch { - case hlsl: __intrinsic_asm "InterlockedOr"; + case glsl: __intrinsic_asm "subgroupBarrier"; + case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync"; + case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; + case spirv: + spirv_asm + { + OpControlBarrier Subgroup Subgroup AcquireRelease|WorkgroupMemory|ImageMemory|UniformMemory + }; } } -[ForceInline] -void InterlockedOr(__ref $(T) dest, $(T) value, out $(T) original_value) +/// Group memory barrier. Ensures that all memory accesses in the group are visible to all threads in the group. +/// @category barrier +__glsl_extension(GL_KHR_memory_scope_semantics) +[require(cuda_glsl_hlsl_metal_spirv_wgsl, memorybarrier)] +void GroupMemoryBarrierWithGroupSync() { __target_switch { - case hlsl: __intrinsic_asm "InterlockedOr"; + case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)"; + case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync"; + case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; + case spirv: + spirv_asm + { + OpControlBarrier Workgroup Workgroup AcquireRelease|WorkgroupMemory + }; + case wgsl: __intrinsic_asm "workgroupBarrier"; } } -/// @category atomic -[ForceInline] -void InterlockedXor(__ref $(T) dest, $(T) value) -{ - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedXor"; - } -} +// Atomics -[ForceInline] -void InterlockedXor(__ref $(T) dest, $(T) value, out $(T) original_value) -{ - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedXor"; - } -} +// Accepts an ImageSubscript +// Gets Texture used with ImageSubscript. +__generic +__intrinsic_op($(kIROp_ExtractTextureFromTextureAccess)) +TextureAccess* __extractTextureFromTextureAccess(__ref TextureAccess x); + +// Accepts an ImageSubscript +// Gets Coord from ImageSubscript. 
Swizzles out ArrayCoord if applicable +__generic +__intrinsic_op($(kIROp_ExtractCoordFromTextureAccess)) +uint __extractCoordFromTextureAccess(__ref TextureAccess x); + +// Accepts an ImageSubscript +// Gets ArrayCoord from ImageSubscript +__generic +__intrinsic_op($(kIROp_ExtractArrayCoordFromTextureAccess)) +uint __extractArrayCoordFromTextureAccess(__ref TextureAccess x); ${{{{ -} // for(const char* T : {"int64_t", "uint64_t"}) +// Generates code for: +// InterlockedAdd, InterlockedAnd, InterlockedOr, InterlockedXor, +// InterlockedMax, InterlockedMin, InterlockedExchange +struct SlangAtomicOperationInfo +{ + const char* slangCallSuffix; + const char* internalCallSuffix; + const char* interface; +}; + +SlangAtomicOperationInfo slangAtomicOperationInfo[7] = { + { "Add", "add", "IArithmeticAtomicable" }, + { "And", "and", "IArithmeticAtomicable" }, + { "Or", "or", "IArithmeticAtomicable" }, + { "Xor", "xor", "IArithmeticAtomicable" }, + { "Max", "max", "IArithmeticAtomicable" }, + { "Min", "min", "IArithmeticAtomicable" }, + { "Exchange", "exchange", "IAtomicable" }, +}; + +for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo) +{ }}}} -/// @category atomic +/// Perform an atomic $(atomicOp.internalCallSuffix) operation on `dest`. +/// @param T The type of the value to perform the atomic operation on. +/// @param dest The value to perform the atomic operation on. +/// @param value The operand to the atomic operation. +/// @param original_value The value of `dest` before the operation. +/// @remarks When targeting HLSL, it is invalid to call this function with `T` being a floating-point type, since +/// HLSL does not allow atomic operations on floating point types. For `InterlockedAdd`, consider using +/// `RWByteAddressBuffer.InterlockedAddF32` or `RWByteAddressBuffer.InterlockedAddF16` instead when NVAPI is available. +/// On SPIR-V (Vulkan), all integer and floating point types are supported. +/// On Metal and WGSL, all floating-point types are not supported. 
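// Usage sketch for the Interlocked* overloads documented here (and generated
// just below): atomics on a groupshared variable and on a buffer element.
// Buffer names, sizes, and the entry point are illustrative only.
RWStructuredBuffer<uint> counters;
groupshared int localSum;

[shader("compute")]
[numthreads(64, 1, 1)]
void countMain(uint3 tid : SV_DispatchThreadID, uint gi : SV_GroupIndex)
{
    if (gi == 0)
        localSum = 0;
    GroupMemoryBarrierWithGroupSync();

    InterlockedAdd(localSum, 1);                  // atomic on groupshared memory

    uint previous;
    InterlockedMax(counters[0], tid.x, previous); // atomic on a buffer element, old value out

    GroupMemoryBarrierWithGroupSync();
    if (gi == 0)
        InterlockedAdd(counters[1], uint(localSum));
}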
+/// @category atomic Atomic functions [ForceInline] __glsl_version(430) [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] -void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value) +void Interlocked$(atomicOp.slangCallSuffix)(__ref T dest, T value) { - static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); - __target_switch - { - case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value, original_value); - case glsl: __glslInterlocked_compare_exchange(dest, compare_value, value, original_value); - case cuda: __cudaInterlocked_compare_exchange(dest, compare_value, value, original_value); - case spirv: __spirvInterlocked_compare_exchange(dest, compare_value, value, original_value); - case metal: - if (__isTextureAccess(dest)) - { - vector vec_compare_value = vector(compare_value); - if(__isTextureArrayAccess(dest)) - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector(value), original_value); - } - else - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), vec_compare_value, vector(value), original_value); - } - } - else - { - __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); - } - return; - } + __atomic_$(atomicOp.internalCallSuffix)(dest, value); } [ForceInline] __glsl_version(430) [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] -void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value) +void Interlocked$(atomicOp.slangCallSuffix)(__ref T dest, T value, out T original_value) { - static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); - __target_switch - { - case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value, original_value); - case cuda: __cudaInterlocked_compare_exchange(dest, compare_value, value, original_value); - case glsl: __glslInterlocked_compare_exchange(dest, compare_value, value, original_value); - case spirv: __spirvInterlocked_compare_exchange(dest, compare_value, value, original_value); - case metal: - if (__isTextureAccess(dest)) - { - vector vec_compare_value = vector(compare_value); - if(__isTextureArrayAccess(dest)) - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector(value), original_value); - } - else - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), vec_compare_value, vector(value), original_value); - } - } - else - { - __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); - } - return; - } + original_value = __atomic_$(atomicOp.internalCallSuffix)(dest, value); } -/// @category atomic [ForceInline] -void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_value, float value) +__glsl_version(430) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] +void Interlocked$(atomicOp.slangCallSuffix)(__ref uint dest, int value) { - static_assert(__isTextureScalarAccess(dest) || 
!__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); - __target_switch - { - case hlsl: __hlslInterlocked_compare_exchange_float_bitwise(dest, compare_value, value); - case metal: - static_assert(!__isTextureAccess(dest), "float atomic texture operations are disallowed with Metal target's"); - __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); - return; - } + __atomic_$(atomicOp.internalCallSuffix)(dest, (uint)value); } -[ForceInline] -void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_value, float value, out float original_value) -{ - static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); - __target_switch - { - case hlsl: __hlslInterlocked_compare_exchange_float_bitwise(dest, compare_value, value, original_value); - case metal: - static_assert(!__isTextureAccess(dest), "float atomic texture operations are disallowed with Metal target's"); - __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); - return; - } -} +${{{{ +} // for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo) +}}}} +/// Perform an atomic compare and exchange operation on `dest`. +/// @param T The type of the value to perform the atomic operation on. +/// @param dest The value to perform the atomic operation on. +/// @param compare_value The value to compare `dest` with. +/// @param value The value to store into `dest` if the compare result is equal. +/// @param original_value The value of `dest` before the operation. +/// @remarks When targeting HLSL, a call to this function with `T` being `float` will translate to a call to +/// `InterlockedCompareExchangeFloatBitwise`, which means the comparison is done as a bitwise comparison. +/// +/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`. +/// +/// On Metal and WGSL, all floating-point types are not supported. +/// +/// On CUDA, this function maps to `atomicCAS`. /// @category atomic [ForceInline] -__glsl_version(430) [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] -void InterlockedCompareStore(__ref int dest, int compare_value, int value) +void InterlockedCompareExchange(__ref T dest, T compare_value, T value, out T original_value) { - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareStore"; - case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)"; - case cuda: __intrinsic_asm "atomicCAS($0, $1, $2)"; - case spirv: - { - spirv_asm - { - result:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value; - }; - return; - } - case metal: - { - if (__isTextureAccess(dest)) - { - vector vec_compare_value = vector(compare_value); - if(__isTextureArrayAccess(dest)) - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector(value)); - } - else - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), vec_compare_value, vector(value)); - } - } - else - { - __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); - } - return; - } - } + original_value = __atomic_compare_exchange(dest, compare_value, value); } +/// Perform an atomic compare and exchange operation on `dest`. 
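// Sketch of a compare-and-exchange retry loop built on the overload above,
// implementing an atomic float max for targets that support float
// compare-exchange (see the remarks). This is an illustrative pattern, not a
// library function; names are hypothetical.
RWStructuredBuffer<float> maxValues;

void atomicMaxFloat(uint index, float value)
{
    float observed = maxValues[index];   // non-atomic snapshot to start from
    while (value > observed)
    {
        float previous;
        InterlockedCompareExchange(maxValues[index], observed, value, previous);
        if (previous == observed)
            break;                       // our value was stored
        observed = previous;             // lost the race; retry against the newer value
    }
}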
+/// @param T The type of the value to perform the atomic operation on. +/// @param dest The value to perform the atomic operation on. +/// @param compare_value The value to compare `dest` with. +/// @param value The value to store into `dest` if the compare result is equal. +/// @param original_value The value of `dest` before the operation. +/// @remarks When targeting HLSL, a call to this function will translate to a call to +/// `InterlockedCompareExchangeFloatBitwise`, which means the comparison is done as a bitwise comparison. +/// +/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`. +/// +/// On Metal and WGSL, this function is not available. +/// +/// On CUDA, this function maps to `atomicCAS`. +/// @category atomic [ForceInline] -__glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] -void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value) +void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_value, float value) { - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareStore"; - case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)"; - case cuda: __intrinsic_asm "atomicCAS((int*)$0, $1, $2)"; - case spirv: - spirv_asm - { - result:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value; - }; - case metal: - if (__isTextureAccess(dest)) - { - vector vec_compare_value = vector(compare_value); - if(__isTextureArrayAccess(dest)) - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector(value)); - } - else - { - __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), - __extractCoordFromTextureAccess(dest), vec_compare_value, vector(value)); - } - } - else - { - __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); - } - return; - } + __atomic_compare_exchange(dest, compare_value, value); } -/// @category atomic [ForceInline] -void InterlockedCompareStoreFloatBitwise(__ref float dest, float compare_value, float value) +void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_value, float value, out float original_value) { - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareStoreFloatBitwise"; - } + original_value = __atomic_compare_exchange(dest, compare_value, value); } +/// Perform an atomic compare and store operation on `dest`. +/// @param T The type of the value to perform the atomic operation on. +/// @param dest The value to perform the atomic operation on. +/// @param compare_value The value to compare `dest` with. +/// @param value The value to store into `dest` if the compare result is equal. +/// @remarks When targeting HLSL, a call to this function with `T` being `float` will translate to a call to +/// `InterlockedCompareStoreFloatBitwise`, which means the comparison is done as a bitwise comparison. +/// +/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`. +/// +/// On Metal and WGSL, this function is not available. +/// +/// On CUDA, this function maps to `atomicCAS`. 
/// @category atomic [ForceInline] -void InterlockedExchange(__ref float dest, float value) +__glsl_version(430) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] +void InterlockedCompareStore(__ref T dest, T compare_value, T value) { - static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); __target_switch { - case hlsl: __hlslInterlocked_exchange(dest, value); - case metal: - static_assert(!__isTextureAccess(dest), "'float' atomic texture operations are disallowed with Metal target's"); - __metalInterlocked_exchange(__getMetalAtomicRef(dest), value); + case hlsl: __intrinsic_asm "InterlockedCompareStore"; + default: + __atomic_compare_exchange(dest, compare_value, value); return; } } +/// Perform an atomic compare and store operation on `dest`. +/// @param T The type of the value to perform the atomic operation on. +/// @param dest The value to perform the atomic operation on. +/// @param compare_value The value to compare `dest` with. +/// @param value The value to store into `dest` if the compare result is equal. +/// @remarks When targeting HLSL, a call to this function will translate to a call to +/// `InterlockedCompareStoreFloatBitwise`, which means the comparison is done as a bitwise comparison. +/// +/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`. +/// +/// On Metal and WGSL, this function is not available. +/// +/// On CUDA, this function maps to `atomicCAS`. +/// @category atomic [ForceInline] -void InterlockedExchange(__ref float dest, float value, out float original_value) +void InterlockedCompareStoreFloatBitwise(__ref T dest, T compare_value, T value) { - static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); __target_switch { - case hlsl: __hlslInterlocked_exchange(dest, value, original_value); - case metal: - static_assert(!__isTextureAccess(dest), "'float' atomic texture operations are disallowed with Metal target's"); - __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + case hlsl: __intrinsic_asm "InterlockedCompareStoreFloatBitwise"; + default: + __atomic_compare_exchange(dest, compare_value, value); return; } } - /// Test if a floating-point value finite. 
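// Sketch of the FloatBitwise variant: the comparison is performed on bit
// patterns, not float values, so a slot holding -0.0f does not match a +0.0f
// compare value even though the two compare equal as floats. Names are
// hypothetical.
RWStructuredBuffer<float> slots;

void claimSlot(uint index, float payload)
{
    // Store `payload` only if the slot still holds exactly the bits of +0.0f.
    InterlockedCompareStoreFloatBitwise(slots[index], 0.0f, payload);
}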
/// @category math __generic @@ -21245,13 +19560,13 @@ extension _TexturegetOperand(0); + emitOperand(base, outerPrec); + m_writer->emit(".asStructuredBuffer<"); + emitType(as(inst->getDataType())->getElementType()); + m_writer->emit(">()"); + } + break; + case kIROp_RWStructuredBufferStore: { auto base = inst->getOperand(0); diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h index 3cccad9e6c..f0d703b40e 100644 --- a/source/slang/slang-emit-c-like.h +++ b/source/slang/slang-emit-c-like.h @@ -260,7 +260,6 @@ class CLikeSourceEmitter: public SourceEmitterBase bool hasExplicitConstantBufferOffset(IRInst* cbufferType); bool isSingleElementConstantBuffer(IRInst* cbufferType); bool shouldForceUnpackConstantBufferElements(IRInst* cbufferType); - // // Expressions // diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 81bcafeb3b..7d104ff1b2 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -515,7 +515,17 @@ bool CUDASourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitInstResultDecl(inst); m_writer->emit("atomicAdd("); + bool needCloseTypeCast = false; + if (inst->getDataType()->getOp() == kIROp_Int64Type) + { + m_writer->emit("(unsigned long long*)("); + needCloseTypeCast = true; + } emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (needCloseTypeCast) + { + m_writer->emit(")"); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -525,7 +535,17 @@ bool CUDASourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitInstResultDecl(inst); m_writer->emit("atomicAdd("); + bool needCloseTypeCast = false; + if (inst->getDataType()->getOp() == kIROp_Int64Type) + { + m_writer->emit("(unsigned long long*)("); + needCloseTypeCast = true; + } emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (needCloseTypeCast) + { + m_writer->emit(")"); + } m_writer->emit(", -("); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit("));\n"); diff --git a/source/slang/slang-emit-glsl.cpp b/source/slang/slang-emit-glsl.cpp index ca55696026..7f8bc14b41 100644 --- a/source/slang/slang-emit-glsl.cpp +++ b/source/slang/slang-emit-glsl.cpp @@ -2153,8 +2153,50 @@ bool GLSLSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu return false; } +static IRImageSubscript* isTextureAccess(IRInst* inst) +{ + return as(getRootAddr(inst->getOperand(0))); +} + +void GLSLSourceEmitter::emitAtomicImageCoord(IRImageSubscript* inst) +{ + emitOperand(inst->getImage(), getInfo(EmitOp::General)); + m_writer->emit(", "); + if (auto vecType = as(inst->getCoord()->getDataType())) + { + m_writer->emit("ivec"); + m_writer->emit(getIntVal(vecType->getElementCount())); + } + else + { + m_writer->emit("int"); + } + m_writer->emit("("); + emitOperand(inst->getCoord(), getInfo(EmitOp::General)); + m_writer->emit(")"); + if (inst->hasSampleCoord()) + { + m_writer->emit(", "); + emitOperand(inst->getSampleCoord(), getInfo(EmitOp::General)); + } +} + bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { + auto requireAtomicExtIfNeeded = [&]() + { + if (isFloatingType(inst->getDataType())) + { + _requireGLSLExtension(toSlice("GL_EXT_shader_atomic_float")); + } + if (isIntegralType(inst->getDataType())) + { + if (getIntTypeInfo(inst->getDataType()).width == 64) + { + _requireGLSLExtension(toSlice("GL_EXT_shader_atomic_int64")); + } + } + }; switch (inst->getOp()) { case kIROp_StructuredBufferGetDimensions: @@ -2176,24 +2218,52 @@ 
bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) case kIROp_AtomicLoad: { emitInstResultDecl(inst); - emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageLoad("); + emitAtomicImageCoord(imageSubscript); + m_writer->emit(")"); + } + else + { + emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(";\n"); return true; } case kIROp_AtomicStore: { - emitInstResultDecl(inst); - emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(" = "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(";\n"); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageStore("); + emitAtomicImageCoord(imageSubscript); + m_writer->emit(", "); + emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); + m_writer->emit(")"); + } + else + { + emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit(" = "); + emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); + m_writer->emit(";\n"); + } return true; } case kIROp_AtomicExchange: { + requireAtomicExtIfNeeded(); emitInstResultDecl(inst); - m_writer->emit("atomicExchange("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicExchange("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicExchange("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2201,9 +2271,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicCompareExchange: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicCompSwap("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicCompSwap("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicCompSwap("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(", "); @@ -2213,9 +2293,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicAdd: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicAdd("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicAdd("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicAdd("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2223,9 +2313,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicSub: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicAdd("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicAdd("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicAdd("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", -("); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); 
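+        // GLSL provides no atomicSub/imageAtomicSub, so AtomicSub is lowered to an
+        // add of the negated operand: atomicAdd(dest, -(value)).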
m_writer->emit("));\n"); @@ -2233,9 +2333,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicAnd: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicAnd("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicAnd("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicAnd("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2243,9 +2353,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicOr: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicOr("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicOr("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicOr("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2253,9 +2373,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicXor: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicXor("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicXor("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicXor("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2263,9 +2393,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicMin: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicMin("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicMin("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicMin("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2273,9 +2413,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicMax: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicMax("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicMax("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicMax("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(");\n"); @@ -2283,9 +2433,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicInc: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicAdd("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicAdd("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicAdd("); + emitOperand(inst->getOperand(0), 
getInfo(EmitOp::General)); + } m_writer->emit(", "); emitType(inst->getDataType()); m_writer->emit("(1)"); @@ -2294,9 +2454,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicDec: { + requireAtomicExtIfNeeded(); + emitInstResultDecl(inst); - m_writer->emit("atomicAdd("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (auto imageSubscript = isTextureAccess(inst)) + { + m_writer->emit("imageAtomicAdd("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomicAdd("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + } m_writer->emit(", "); emitType(inst->getDataType()); m_writer->emit("(-1)"); diff --git a/source/slang/slang-emit-glsl.h b/source/slang/slang-emit-glsl.h index 8958c7608e..12ab60e46a 100644 --- a/source/slang/slang-emit-glsl.h +++ b/source/slang/slang-emit-glsl.h @@ -133,6 +133,8 @@ class GLSLSourceEmitter : public CLikeSourceEmitter void _emitSpecialFloatImpl(IRType* type, const char* valueExpr); + void emitAtomicImageCoord(IRImageSubscript* operand); + Dictionary> m_referencingEntryPoints; RefPtr m_glslExtensionTracker; diff --git a/source/slang/slang-emit-hlsl.cpp b/source/slang/slang-emit-hlsl.cpp index b45b4c5752..ae87fd6d52 100644 --- a/source/slang/slang-emit-hlsl.cpp +++ b/source/slang/slang-emit-hlsl.cpp @@ -498,6 +498,10 @@ void HLSLSourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPoin bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { + auto diagnoseFloatAtommic = [&]() + { + getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "floating point atomic operation"); + }; switch (inst->getOp()) { case kIROp_AtomicLoad: @@ -519,7 +523,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedExchange("); + m_writer->emit("InterlockedExchange"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -532,7 +537,10 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedCompareExchange("); + m_writer->emit("InterlockedCompareExchange"); + if (inst->getDataType()->getOp() == kIROp_FloatType) + m_writer->emit("FloatBitwise"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -547,7 +555,12 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedAdd("); + if (inst->getDataType()->getOp() == kIROp_FloatType) + { + diagnoseFloatAtommic(); + } + m_writer->emit("InterlockedAdd"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -560,7 +573,12 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedAdd("); + if (inst->getDataType()->getOp() == kIROp_FloatType) + { + diagnoseFloatAtommic(); + } + m_writer->emit("InterlockedAdd"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", -("); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ 
-573,7 +591,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedAnd("); + m_writer->emit("InterlockedAnd"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -586,7 +605,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedOr("); + m_writer->emit("InterlockedOr"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -599,7 +619,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedXor("); + m_writer->emit("InterlockedXor"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -612,7 +633,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedMin("); + m_writer->emit("InterlockedMin"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -625,7 +647,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedMax("); + m_writer->emit("InterlockedMax"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); @@ -638,7 +661,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedAdd("); + m_writer->emit("InterlockedAdd"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", 1, "); m_writer->emit(getName(inst)); @@ -649,7 +673,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n"); - m_writer->emit("InterlockedAdd("); + m_writer->emit("InterlockedAdd"); + m_writer->emit("("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", -1, "); m_writer->emit(getName(inst)); diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp index 2d5a7d56b2..abd4d670a0 100644 --- a/source/slang/slang-emit-metal.cpp +++ b/source/slang/slang-emit-metal.cpp @@ -260,8 +260,118 @@ void MetalSourceEmitter::emitMemoryOrderOperand(IRInst* inst) } } +static IRImageSubscript* isTextureAccess(IRInst* inst) +{ + return as(getRootAddr(inst->getOperand(0))); +} + +void MetalSourceEmitter::emitAtomicImageCoord(IRImageSubscript* inst) +{ + auto resourceType = as(inst->getImage()->getDataType()); + if (auto textureType = as(resourceType)) + { + if (as(textureType->getElementType())) + { + getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "atomic operation on non-scalar texture"); + } + } + bool isArray = getIntVal(resourceType->getIsArrayInst()) != 0; + if (isArray) + { + emitOperand(inst->getCoord(), getInfo(EmitOp::Postfix)); + if (auto 
coordType = as(inst->getCoord()->getDataType())) + { + m_writer->emit("."); + const char* elements[] = { "x", "y", "z", "w" }; + for (IRIntegerValue i = 0; i < getIntVal(coordType->getElementCount()) - 1; i++) + m_writer->emit(elements[Math::Min(3, (int)i)]); + m_writer->emit(", "); + emitOperand(inst->getCoord(), getInfo(EmitOp::Postfix)); + m_writer->emit("."); + m_writer->emit(elements[Math::Min(3, (int)getIntVal(coordType->getElementCount()) - 1)]); + } + else + { + getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "invalid image coordinate for atomic operation"); + } + } + else + { + emitOperand(inst->getCoord(), getInfo(EmitOp::General)); + } +} + +void MetalSourceEmitter::emitAtomicDestOperand(IRInst* inst) +{ + // If operand is already an atomic type, we can emit it + // as is. + auto ptrType = as(inst->getDataType()); + if (ptrType && as(ptrType->getValueType())) + { + emitOperand(inst, getInfo(EmitOp::General)); + return; + } + // Otherwise, we need to emit a cast. + m_writer->emit("((atomic_"); + emitType(inst->getDataType()); + m_writer->emit(")("); + emitOperand(inst, getInfo(EmitOp::General)); + m_writer->emit("))"); +} + +void MetalSourceEmitter::emitAtomicSrcOperand(bool isImage, IRInst* inst) +{ + if (!isImage) + { + emitOperand(inst, getInfo(EmitOp::General)); + return; + } + // If we are emitting a source operand for an atomic image operation, + // we need to convert it into a 4-vector. + m_writer->emit("vec<"); + emitType(inst->getDataType()); + m_writer->emit(", 4>("); + emitOperand(inst, getInfo(EmitOp::General)); + m_writer->emit(")"); +} + bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) { + auto emitAtomicOp = [&](const char* imageFunc, const char* bufferFunc) + { + emitInstResultDecl(inst); + bool isImageOp = false; + if (auto imageSubscript = isTextureAccess(inst)) + { + emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix)); + m_writer->emit("."); + m_writer->emit(imageFunc); + m_writer->emit("("); + emitAtomicImageCoord(imageSubscript); + isImageOp = true; + } + else + { + m_writer->emit(bufferFunc); + m_writer->emit("("); + emitAtomicDestOperand(inst->getOperand(0)); + } + m_writer->emit(", "); + emitAtomicSrcOperand(isImageOp, inst->getOperand(1)); + if (!isImageOp) + { + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(inst->getOperandCount() - 1)); + } + if (isImageOp) + m_writer->emit(").x;\n"); + else + m_writer->emit(");\n"); + }; + auto diagnoseFloatAtommic = [&]() + { + getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "floating point atomic operation"); + }; switch (inst->getOp()) { case kIROp_discard: @@ -287,160 +397,216 @@ bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) } case kIROp_AtomicLoad: { + if (isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + emitInstResultDecl(inst); - m_writer->emit("atomic_load_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(1)); - m_writer->emit(");\n"); + bool isImageOp = false; + if (auto imageSubscript = isTextureAccess(inst)) + { + emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix)); + m_writer->emit(".atomic_load("); + emitAtomicImageCoord(imageSubscript); + isImageOp = true; + } + else + { + m_writer->emit("atomic_load_explicit("); + emitAtomicDestOperand(inst->getOperand(0)); + } + if (!isImageOp) + { + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(1)); + } + if (isImageOp) + 
m_writer->emit(").x;\n"); + else + m_writer->emit(");\n"); return true; } case kIROp_AtomicStore: { - m_writer->emit("atomic_store_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); + bool isImageOp = false; + if (auto imageSubscript = isTextureAccess(inst)) + { + emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix)); + m_writer->emit(".atomic_store("); + emitAtomicImageCoord(imageSubscript); + isImageOp = true; + } + else + { + m_writer->emit("atomic_store_explicit("); + emitAtomicDestOperand(inst->getOperand(0)); + } m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); + emitAtomicSrcOperand(isImageOp, inst->getOperand(1)); + if (!isImageOp) + { + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(2)); + } m_writer->emit(");\n"); return true; } case kIROp_AtomicExchange: { - emitInstResultDecl(inst); - m_writer->emit("atomic_exchange_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + if (isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + + emitAtomicOp("atomic_exchange", "atomic_exchange_explicit"); return true; } case kIROp_AtomicCompareExchange: { + if (isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + + bool isImageOp = false; + auto imageSubscript = isTextureAccess(inst); + isImageOp = (imageSubscript != nullptr); + emitType(inst->getDataType(), getName(inst)); m_writer->emit(";\n{\n"); - emitType(inst->getDataType(), "_metal_cas_comparand"); + if (isImageOp) + m_writer->emit("vec<"); + emitType(inst->getDataType()); + if (isImageOp) + m_writer->emit(", 4>"); + m_writer->emit(" _metal_cas_comparand"); m_writer->emit(" = "); emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); m_writer->emit(";\n"); - - m_writer->emit(getName(inst)); - m_writer->emit(" = atomic_compare_exchange_weak_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + if (imageSubscript) + { + emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix)); + m_writer->emit(".atomic_compare_exchange_weak("); + emitAtomicImageCoord(imageSubscript); + } + else + { + m_writer->emit("atomic_compare_exchange_weak_explicit("); + emitAtomicDestOperand(inst->getOperand(0)); + } m_writer->emit(", &_metal_cas_comparand, "); - emitOperand(inst->getOperand(2), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(3)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(4)); - m_writer->emit(");\n}\n"); + emitAtomicSrcOperand(isImageOp, inst->getOperand(2)); + if (!isImageOp) + { + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(3)); + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(4)); + } + m_writer->emit(");\n"); + m_writer->emit(getName(inst)); + m_writer->emit(" = _metal_cas_comparand"); + if (isImageOp) + m_writer->emit(".x"); + m_writer->emit(";\n}\n"); return true; } case kIROp_AtomicAdd: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_add_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + if 
(isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + + emitAtomicOp("atomic_fetch_add", "atomic_fetch_add_explicit"); return true; } case kIROp_AtomicSub: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_sub_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + if (isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + + emitAtomicOp("atomic_fetch_sub", "atomic_fetch_sub_explicit"); return true; } case kIROp_AtomicAnd: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_and_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + emitAtomicOp("atomic_fetch_and", "atomic_fetch_and_explicit"); return true; } case kIROp_AtomicOr: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_or_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + emitAtomicOp("atomic_fetch_or", "atomic_fetch_or_explicit"); return true; } case kIROp_AtomicXor: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_xor_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + emitAtomicOp("atomic_fetch_xor", "atomic_fetch_xor_explicit"); return true; } case kIROp_AtomicMin: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_min_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + if (isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + + emitAtomicOp("atomic_fetch_min", "atomic_fetch_min_explicit"); return true; } case kIROp_AtomicMax: { - emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_max_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitMemoryOrderOperand(inst->getOperand(2)); - m_writer->emit(");\n"); + if (isFloatingType(inst->getDataType())) + diagnoseFloatAtommic(); + + emitAtomicOp("atomic_fetch_max", "atomic_fetch_max_explicit"); return true; } case kIROp_AtomicInc: { emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_add_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", 1, "); - emitMemoryOrderOperand(inst->getOperand(1)); - m_writer->emit(");\n"); + bool isImageOp = false; + if (auto imageSubscript = isTextureAccess(inst)) + { + emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix)); + m_writer->emit(".atomic_fetch_add("); + emitAtomicImageCoord(imageSubscript); + isImageOp = true; + } + else + { + m_writer->emit("atomic_fetch_add_explicit("); + emitAtomicDestOperand(inst->getOperand(0)); + } + m_writer->emit(", 1"); + 
if (!isImageOp) + { + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(1)); + } + if (isImageOp) + m_writer->emit(").x;\n"); + else + m_writer->emit(");\n"); return true; } case kIROp_AtomicDec: { emitInstResultDecl(inst); - m_writer->emit("atomic_fetch_sub_explicit("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(", 1, "); - emitMemoryOrderOperand(inst->getOperand(1)); - m_writer->emit(");\n"); + bool isImageOp = false; + if (auto imageSubscript = isTextureAccess(inst)) + { + emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix)); + m_writer->emit(".atomic_fetch_sub("); + emitAtomicImageCoord(imageSubscript); + isImageOp = true; + } + else + { + m_writer->emit("atomic_fetch_sub_explicit("); + emitAtomicDestOperand(inst->getOperand(0)); + } + m_writer->emit(", 1"); + if (!isImageOp) + { + m_writer->emit(", "); + emitMemoryOrderOperand(inst->getOperand(1)); + } + if (isImageOp) + m_writer->emit(").x;\n"); + else + m_writer->emit(");\n"); return true; } } diff --git a/source/slang/slang-emit-metal.h b/source/slang/slang-emit-metal.h index 8e33eddefc..e0fe1f1c82 100644 --- a/source/slang/slang-emit-metal.h +++ b/source/slang/slang-emit-metal.h @@ -79,6 +79,11 @@ class MetalSourceEmitter : public CLikeSourceEmitter void _emitStageAccessSemantic(IRStageAccessDecoration* decoration, const char* name); bool _emitUserSemantic(UnownedStringSlice semanticName, IRIntegerValue semanticIndex); bool maybeEmitSystemSemantic(IRInst* inst); + + void emitAtomicImageCoord(IRImageSubscript* subscript); + void emitAtomicDestOperand(IRInst* operand); + void emitAtomicSrcOperand(bool isImage, IRInst* operand); + void emitAtomicSemanticOperand(IRInst* inst); }; } diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp index 0f123b8fdc..62819e6d59 100644 --- a/source/slang/slang-emit-spirv.cpp +++ b/source/slang/slang-emit-spirv.cpp @@ -2929,11 +2929,11 @@ struct SPIRVEmitContext void ensureAtomicCapability(IRInst* atomicInst, SpvOp op) { + auto typeOp = atomicInst->getDataType()->getOp(); switch (op) { case SpvOpAtomicFAddEXT: { - auto typeOp = getVectorElementType(atomicInst->getDataType())->getOp(); switch (typeOp) { case kIROp_FloatType: @@ -2948,13 +2948,19 @@ struct SPIRVEmitContext ensureExtensionDeclaration(toSlice("SPV_EXT_shader_atomic_float16_add")); requireSPIRVCapability(SpvCapabilityAtomicFloat16AddEXT); break; + case kIROp_VectorType: + if (as(atomicInst->getDataType())->getElementType()->getOp() == kIROp_HalfType) + { + ensureExtensionDeclaration(toSlice("VK_NV_shader_atomic_float16_vector")); + requireSPIRVCapability(SpvCapabilityAtomicFloat16VectorNV); + } + break; } } break; case SpvOpAtomicFMinEXT: case SpvOpAtomicFMaxEXT: { - auto typeOp = getVectorElementType(atomicInst->getDataType())->getOp(); switch (typeOp) { case kIROp_FloatType: @@ -2969,10 +2975,24 @@ struct SPIRVEmitContext ensureExtensionDeclaration(toSlice("SPV_EXT_shader_atomic_float_min_max")); requireSPIRVCapability(SpvCapabilityAtomicFloat16MinMaxEXT); break; + case kIROp_VectorType: + if (as(atomicInst->getDataType())->getElementType()->getOp() == kIROp_HalfType) + { + ensureExtensionDeclaration(toSlice("VK_NV_shader_atomic_float16_vector")); + requireSPIRVCapability(SpvCapabilityAtomicFloat16VectorNV); + } + break; } } break; } + switch (typeOp) + { + case kIROp_UInt64Type: + case kIROp_Int64Type: + requireSPIRVCapability(SpvCapabilityInt64Atomics); + break; + } } // The instructions that appear inside the basic blocks of @@ -3321,6 +3341,7 
@@ struct SPIRVEmitContext const auto memoryScope = emitIntConstant(IRIntegerValue{SpvScopeDevice}, builder.getUIntType()); const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(1)); result = emitOpAtomicIIncrement(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics); + ensureAtomicCapability(inst, SpvOpAtomicIIncrement); } break; case kIROp_AtomicDec: @@ -3329,6 +3350,7 @@ struct SPIRVEmitContext const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType()); const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(1)); result = emitOpAtomicIDecrement(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics); + ensureAtomicCapability(inst, SpvOpAtomicIDecrement); } break; case kIROp_AtomicLoad: @@ -3337,6 +3359,7 @@ struct SPIRVEmitContext const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType()); const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(1)); result = emitOpAtomicLoad(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics); + ensureAtomicCapability(inst, SpvOpAtomicLoad); } break; case kIROp_AtomicStore: @@ -3345,6 +3368,7 @@ struct SPIRVEmitContext const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType()); const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(2)); result = emitOpAtomicStore(parent, inst, inst->getOperand(0), memoryScope, memorySemantics, inst->getOperand(1)); + ensureAtomicCapability(inst, SpvOpAtomicStore); } break; case kIROp_AtomicExchange: @@ -3353,6 +3377,7 @@ struct SPIRVEmitContext const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType()); const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(2)); result = emitOpAtomicExchange(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics, inst->getOperand(1)); + ensureAtomicCapability(inst, SpvOpAtomicExchange); } break; case kIROp_AtomicCompareExchange: @@ -3365,6 +3390,7 @@ struct SPIRVEmitContext parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemanticsEqual, memorySemanticsUnequal, inst->getOperand(2), inst->getOperand(1)); + ensureAtomicCapability(inst, SpvOpAtomicCompareExchange); } break; case kIROp_AtomicAdd: diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index c9319a13bc..2206d29cff 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -53,9 +53,7 @@ #include "slang-ir-lower-l-value-cast.h" #include "slang-ir-lower-reinterpret.h" #include "slang-ir-loop-unroll.h" -#include "slang-ir-legalize-extract-from-texture-access.h" #include "slang-ir-legalize-image-subscript.h" -#include "slang-ir-legalize-is-texture-access.h" #include "slang-ir-legalize-vector-types.h" #include "slang-ir-metadata.h" #include "slang-ir-optix-entry-point-uniforms.h" @@ -1058,9 +1056,6 @@ Result linkAndOptimizeIR( legalizeVectorTypes(irModule, sink); - // Legalize `__isTextureAccess` and related. - legalizeIsTextureAccess(irModule, sink); - // Once specialization and type legalization have been performed, // we should perform some of our basic optimization steps again, // to see if we can clean up any temporaries created by legalization. @@ -1335,8 +1330,6 @@ Result linkAndOptimizeIR( // Create aliases for all dynamic resource parameters. 
if(requiredLoweringPassSet.dynamicResource && isKhronosTarget(targetRequest)) legalizeDynamicResourcesForGLSL(codeGenContext, irModule); - - legalizeExtractFromTextureAccess(irModule); // Legalize `ImageSubscript` loads. switch (target) diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp index 7cde707773..aabc193dd4 100644 --- a/source/slang/slang-intrinsic-expand.cpp +++ b/source/slang/slang-intrinsic-expand.cpp @@ -653,112 +653,6 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor) } } break; - - case 'a': - { - // We have an operation that needs to lower to either - // `atomic*` or `imageAtomic*` for GLSL, depending on - // whether its first operand is a subscript into an - // array. This `$a` is the first `a` in `atomic`, - // so we will replace it accordingly. - // - // TODO: This distinction should be made earlier, - // with the front-end picking the right overload - // based on the "address space" of the argument. - - Index argIndex = 0; - SLANG_RELEASE_ASSERT(m_argCount > argIndex); - - auto arg = m_args[argIndex].get(); - if (arg->getOp() == kIROp_ImageSubscript) - { - m_writer->emit("imageA"); - } - else - { - m_writer->emit("a"); - } - } - break; - - case 'A': - { - // We have an operand that represents the destination - // of an atomic operation in GLSL, and it should - // be lowered based on whether it is an ordinary l-value, - // or an image subscript. In the image subscript case - // this operand will turn into multiple arguments - // to the `imageAtomic*` function. - // - - Index argIndex = 0; - SLANG_RELEASE_ASSERT(m_argCount > argIndex); - - auto arg = m_args[argIndex].get(); - if (arg->getOp() == kIROp_ImageSubscript) - { - if (m_emitter->getSourceLanguage() == SourceLanguage::GLSL) - { - // TODO: we don't handle the multisample - // case correctly here, where the last - // component of the image coordinate needs - // to be broken out into its own argument. - // - m_writer->emit("("); - m_emitter->emitOperand(arg->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit("), "); - - // The coordinate argument will have been computed - // as a `vector` because that is how the - // HLSL image subscript operations are defined. - // In contrast, the GLSL `imageAtomic*` operations - // expect `vector` coordinates, so we - // will hackily insert the conversion here as - // part of the intrinsic op. 
- // - auto coords = arg->getOperand(1); - auto coordsType = coords->getDataType(); - - auto coordsVecType = as(coordsType); - IRIntegerValue elementCount = 1; - if (coordsVecType) - { - coordsType = coordsVecType->getElementType(); - elementCount = getIntVal(coordsVecType->getElementCount()); - } - - SLANG_ASSERT(coordsType->getOp() == kIROp_UIntType); - - if (elementCount > 1) - { - m_writer->emit("ivec"); - m_writer->emit(elementCount); - } - else - { - m_writer->emit("int"); - } - - m_writer->emit("("); - m_emitter->emitOperand(arg->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(")"); - } - else - { - m_writer->emit("("); - m_emitter->emitOperand(arg, getInfo(EmitOp::General)); - m_writer->emit(")"); - } - } - else - { - m_writer->emit("("); - m_emitter->emitOperand(arg, getInfo(EmitOp::General)); - m_writer->emit(")"); - } - } - break; - case 'P': // Type-based prefix as used for CUDA and C++ targets { diff --git a/source/slang/slang-ir-legalize-extract-from-texture-access.cpp b/source/slang/slang-ir-legalize-extract-from-texture-access.cpp deleted file mode 100644 index de1e244a89..0000000000 --- a/source/slang/slang-ir-legalize-extract-from-texture-access.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include "slang-ir-legalize-extract-from-texture-access.h" - -#include "slang-ir.h" -#include "slang-ir-insts.h" -#include "slang-ir-util.h" -#include "slang-ir-clone.h" -#include "slang-ir-specialize-address-space.h" -#include "slang-parameter-binding.h" -#include "slang-ir-legalize-image-subscript.h" -#include "slang-ir-legalize-varying-params.h" -#include "slang-ir-simplify-cfg.h" - -namespace Slang -{ - void legalizeExtractTextureFromTextureAccess(IRBuilder& builder, IRInst* inst) - { - SLANG_ASSERT(inst); - - builder.setInsertBefore(inst); - IRImageSubscript* imageSubscript = as(getRootAddr(inst->getOperand(0))); - SLANG_ASSERT(imageSubscript); - SLANG_ASSERT(imageSubscript->getImage()); - inst->replaceUsesWith(imageSubscript->getImage()); - inst->removeAndDeallocate(); - // Ensure we are done processing the imageSubscript before we remove it - if (!imageSubscript->hasUses()) - imageSubscript->removeAndDeallocate(); - } - - void legalizeExtractArrayCoordFromTextureAccess(IRBuilder& builder, IRInst* inst) - { - SLANG_ASSERT(inst); - - builder.setInsertBefore(inst); - IRImageSubscript* imageSubscript = as(getRootAddr(inst->getOperand(0))); - SLANG_ASSERT(imageSubscript); - SLANG_ASSERT(imageSubscript->getImage()); - - auto image = as(imageSubscript->getImage()->getDataType()); - IRInst* coord = imageSubscript->getCoord(); - if(image->isArray()) - { - // Extract final element which is 'ArrayCoord' - IRVectorType* coordType = as(imageSubscript->getCoord()->getDataType()); - SLANG_ASSERT(coordType); - auto coordSize = getIRVectorElementSize(coordType); - - IRType* newArrayCoordType = coordType->getElementType(); - auto arrayCoordLocation = coordSize - 1; - List swizzleIndicies = { (UInt)arrayCoordLocation }; - - coord = builder.emitSwizzle(newArrayCoordType, coord, 1, swizzleIndicies.getBuffer()); - } - else - coord = builder.getIntValue(builder.getUIntType(), 0); - - - inst->replaceUsesWith(coord); - inst->removeAndDeallocate(); - // Ensure we are done processing the imageSubscript completly before we remove it - if (!imageSubscript->hasUses()) - imageSubscript->removeAndDeallocate(); - } - - void legalizeExtractCoordFromTextureAccess(IRBuilder& builder, IRInst* inst) - { - SLANG_ASSERT(inst); - - builder.setInsertBefore(inst); - IRImageSubscript* imageSubscript = 
as(getRootAddr(inst->getOperand(0))); - SLANG_ASSERT(imageSubscript); - SLANG_ASSERT(imageSubscript->getImage()); - - auto image = as(imageSubscript->getImage()->getDataType()); - IRInst* coord = imageSubscript->getCoord(); - if(image->isArray()) - { - // Extract all but final element which is 'ArrayCoord' - IRVectorType* coordType = as(imageSubscript->getCoord()->getDataType()); - auto coordSize = getIRVectorElementSize(coordType); - SLANG_ASSERT(coordType); - - IRType* newCoordType = nullptr; - auto newCoordSize = coordSize - 1; - if(newCoordSize != 1) - newCoordType = builder.getVectorType(coordType->getElementType(), newCoordSize); - else - newCoordType = coordType->getElementType(); - List swizzleIndicies = {1, 2, 3, 4}; - - coord = builder.emitSwizzle(newCoordType, coord, newCoordSize, swizzleIndicies.getBuffer()); - } - - inst->replaceUsesWith(coord); - inst->removeAndDeallocate(); - // Ensure we are done processing the imageSubscript completly before we remove it - if (!imageSubscript->hasUses()) - imageSubscript->removeAndDeallocate(); - } - - void legalizeExtractFromTextureAccess(IRModule* module) - { - IRBuilder builder(module); - for (auto globalInst : module->getModuleInst()->getChildren()) - { - auto func = as(globalInst); - if (!func) - continue; - for (auto block : func->getBlocks()) - { - auto inst = block->getFirstInst(); - IRInst* next; - for ( ; inst; inst = next) - { - next = inst->getNextInst(); - switch (inst->getOp()) - { - case kIROp_ExtractArrayCoordFromTextureAccess: - if (as(getRootAddr(inst->getOperand(0)))) - legalizeExtractArrayCoordFromTextureAccess(builder, inst); - continue; - case kIROp_ExtractCoordFromTextureAccess: - if (as(getRootAddr(inst->getOperand(0)))) - legalizeExtractCoordFromTextureAccess(builder, inst); - continue; - case kIROp_ExtractTextureFromTextureAccess: - if (as(getRootAddr(inst->getOperand(0)))) - legalizeExtractTextureFromTextureAccess(builder, inst); - continue; - } - } - } - } - } -} - diff --git a/source/slang/slang-ir-legalize-extract-from-texture-access.h b/source/slang/slang-ir-legalize-extract-from-texture-access.h deleted file mode 100644 index 016c86defb..0000000000 --- a/source/slang/slang-ir-legalize-extract-from-texture-access.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "slang-ir.h" -#include "slang-compiler.h" - -namespace Slang -{ - class DiagnosticSink; - - void legalizeExtractFromTextureAccess(IRModule* module); -} diff --git a/source/slang/slang-ir-legalize-is-texture-access.cpp b/source/slang/slang-ir-legalize-is-texture-access.cpp deleted file mode 100644 index b9a0a7772a..0000000000 --- a/source/slang/slang-ir-legalize-is-texture-access.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include "slang-ir-legalize-is-texture-access.h" - -#include "slang-ir.h" -#include "slang-ir-insts.h" -#include "slang-ir-util.h" -#include "slang-ir-clone.h" -#include "slang-ir-specialize-address-space.h" -#include "slang-parameter-binding.h" -#include "slang-ir-legalize-image-subscript.h" -#include "slang-ir-legalize-varying-params.h" -#include "slang-ir-sccp.h" - -namespace Slang -{ - IRImageSubscript* getTextureAccess(IRInst* inst) - { - return as(getRootAddr(inst->getOperand(0))); - } - - void legalizeIsTextureAccess(IRModule* module, DiagnosticSink* sink) - { - HashSet functionsToSCCP; - IRBuilder builder(module); - for (auto globalInst : module->getModuleInst()->getChildren()) - { - auto func = as(globalInst); - if (!func) - continue; - for (auto block : func->getBlocks()) - { - auto inst = block->getFirstInst(); - IRInst* 
next; - for ( ; inst; inst = next) - { - next = inst->getNextInst(); - switch (inst->getOp()) - { - case kIROp_IsTextureAccess: - if (getTextureAccess(inst)) - inst->replaceUsesWith(builder.getBoolValue(true)); - else - inst->replaceUsesWith(builder.getBoolValue(false)); - inst->removeAndDeallocate(); - functionsToSCCP.add(func); - continue; - case kIROp_IsTextureArrayAccess: - { - auto textureAccess = getTextureAccess(inst); - if (textureAccess && as(textureAccess->getImage()->getDataType())->isArray()) - inst->replaceUsesWith(builder.getBoolValue(true)); - else - inst->replaceUsesWith(builder.getBoolValue(false)); - inst->removeAndDeallocate(); - functionsToSCCP.add(func); - continue; - } - case kIROp_IsTextureScalarAccess: - { - auto textureAccess = getTextureAccess(inst); - if (textureAccess && !as(as(textureAccess->getImage()->getDataType())->getElementType())) - inst->replaceUsesWith(builder.getBoolValue(true)); - else - inst->replaceUsesWith(builder.getBoolValue(false)); - inst->removeAndDeallocate(); - functionsToSCCP.add(func); - continue; - } - } - } - } - } - // Requires a SCCP to ensure Slang does not evaluate 'IRTextureType' code path - // and unresolved 'isTextureAccess' operations for when 'inst' is not a - // 'IRTextureType'/`TextureAccessor` - for (auto func : functionsToSCCP) - applySparseConditionalConstantPropagation(func, sink); - } -} - diff --git a/source/slang/slang-ir-legalize-is-texture-access.h b/source/slang/slang-ir-legalize-is-texture-access.h deleted file mode 100644 index 9b9e1cca0f..0000000000 --- a/source/slang/slang-ir-legalize-is-texture-access.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "slang-ir.h" -#include "slang-compiler.h" - -namespace Slang -{ - class DiagnosticSink; - - void legalizeIsTextureAccess(IRModule* module, DiagnosticSink* sink); -} diff --git a/source/slang/slang-ir-use-uninitialized-values.cpp b/source/slang/slang-ir-use-uninitialized-values.cpp index 98fd9841a3..fea55de8de 100644 --- a/source/slang/slang-ir-use-uninitialized-values.cpp +++ b/source/slang/slang-ir-use-uninitialized-values.cpp @@ -315,8 +315,11 @@ namespace Slang case kIROp_Unmodified: return Store; - // ... and the rest will load/use them default: + // Default case is that if the instruction is a pointer, it + // is considered a store, otherwise a load. 
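+            // (e.g. instructions that form a pointer into the value, which may later
+            // be written through, are conservatively treated as writes)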
+ if (as(user->getDataType())) + return Store; return Load; } } diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp index d0dcfd4fbf..e0998779a1 100644 --- a/source/slang/slang-ir.cpp +++ b/source/slang/slang-ir.cpp @@ -5092,7 +5092,7 @@ namespace Slang auto inst = createInst( this, kIROp_AtomicStore, - nullptr, + getVoidType(), dstPtr, srcVal, memoryOrder); diff --git a/tests/bugs/gh-3997.slang b/tests/bugs/gh-3997.slang index 8c75da426a..d42e65e39a 100644 --- a/tests/bugs/gh-3997.slang +++ b/tests/bugs/gh-3997.slang @@ -10,7 +10,7 @@ float atomicAdd(__ref float value, float amount) __requirePrelude("#include "); __intrinsic_asm "std::atomic_ref(*$0).fetch_add($1)"; case spirv: - return __atomicAdd(value, amount); + return __atomic_add(value, amount); } } diff --git a/tests/compute/atomics-invalid-dest-type.slang b/tests/compute/atomics-invalid-dest-type.slang index 864debaee6..5ae03a5c7d 100644 --- a/tests/compute/atomics-invalid-dest-type.slang +++ b/tests/compute/atomics-invalid-dest-type.slang @@ -1,11 +1,8 @@ // atomics-buffer.slang -//TEST:SIMPLE(filecheck=CHECK): -target spirv -stage compute -entry computeMain -//TEST:SIMPLE(filecheck=CHECK): -target hlsl -stage compute -entry computeMain -//TEST:SIMPLE(filecheck=CHECK): -target glsl -stage compute -entry computeMain //TEST:SIMPLE(filecheck=CHECK): -target metal -stage compute -entry computeMain -//CHECK: Atomic must be applied to a scalar texture or non-texture +//CHECK: atomic operation on non-scalar texture RWBuffer outputBuffer; diff --git a/tests/compute/nonuniformres-atomic.slang b/tests/compute/nonuniformres-atomic.slang index 95ae502dcf..10dd30cb03 100644 --- a/tests/compute/nonuniformres-atomic.slang +++ b/tests/compute/nonuniformres-atomic.slang @@ -9,7 +9,7 @@ RWTexture2D texArray[2]; void main( uint2 dispatchThreadID : SV_DispatchThreadID, uint2 groupThreadID : SV_GroupThreadID ) { - // CHECK0: imageAtomicAdd((texArray_{{.*}}[nonuniformEXT({{.*}})] + // CHECK0: {{.*}}imageAtomicAdd(texArray_{{.*}}[nonuniformEXT({{.*}})] // CHECK1: InterlockedAdd(texArray_{{.*}}[NonUniformResourceIndex({{.*}})] diff --git a/tests/hlsl-intrinsic/texture/float-atomics.slang b/tests/hlsl-intrinsic/texture/float-atomics.slang index 02cb5570c8..913380416c 100644 --- a/tests/hlsl-intrinsic/texture/float-atomics.slang +++ b/tests/hlsl-intrinsic/texture/float-atomics.slang @@ -24,6 +24,6 @@ void computeMain(uint3 tid : SV_DispatchThreadID) AllMemoryBarrier(); // CHECK: 4.0 - outputBuffer[0] = t[uint2(1, 0)]; + outputBuffer[0] = t[uint2(1, 0)] + originalValue; } diff --git a/tests/metal/atomic-byteaddressbuffer.slang b/tests/metal/atomic-byteaddressbuffer.slang new file mode 100644 index 0000000000..677f80dbf5 --- /dev/null +++ b/tests/metal/atomic-byteaddressbuffer.slang @@ -0,0 +1,57 @@ +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -compute -shaderobj -output-using-type +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-cuda -compute -shaderobj -output-using-type +//TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0]):name=uintBuffer +RWByteAddressBuffer uintBuffer; + +//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +[numthreads(1,1,1)] +void computeMain() +{ + uintBuffer.InterlockedAdd(0, 1); + int oldValue; 
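+    // uintBuffer starts at zero (see TEST_INPUT above); each call below records the
+    // pre-operation value in `oldValue`, which is written to outputBuffer for the CHK checks.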
+ //LIB: call {{.*}}.atomic.global.add.u.i32 + uintBuffer.InterlockedAdd(0, 1, oldValue); + // CHK: 1 + outputBuffer[0] = oldValue; + + uintBuffer.InterlockedAdd(0, 1, oldValue); + // CHK: 2 + outputBuffer[1] = (int)oldValue; + + uintBuffer.InterlockedCompareExchange(0, 3, 4, oldValue); + // CHK: 3 + outputBuffer[2] = (int)oldValue; + + uintBuffer.InterlockedOr(0, 3, oldValue); + // CHK: 4 + outputBuffer[3] = oldValue; // 4 + + uintBuffer.InterlockedExchange(0, 4, oldValue); + // CHK: 7 + outputBuffer[4] = oldValue; // 7 + + uintBuffer.InterlockedMin(0, 3, oldValue); + // CHK: 4 + outputBuffer[5] = oldValue; // 4 + + uintBuffer.InterlockedMax(0, 4, oldValue); + // CHK: 3 + outputBuffer[6] = oldValue; // 3 + + uintBuffer.InterlockedAnd(0, 7, oldValue); + // CHK: 4 + outputBuffer[7] = oldValue; // 4 + + uintBuffer.InterlockedXor(0, 7, oldValue); + // CHK: 4 + outputBuffer[8] = oldValue; // 4 + + // CHK: 3 + outputBuffer[9] = uintBuffer.Load(0); + +} \ No newline at end of file diff --git a/tests/metal/atomic-intrinsics.slang b/tests/metal/atomic-intrinsics.slang index 5d47db913b..afa0e53651 100644 --- a/tests/metal/atomic-intrinsics.slang +++ b/tests/metal/atomic-intrinsics.slang @@ -1,8 +1,7 @@ //TEST:SIMPLE(filecheck=MTL):-target metal -entry computeMain -stage compute -DMETAL //TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type -//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-directly -compute -shaderobj -output-using-type -//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-via-glsl -compute -shaderobj -output-using-type +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -compute -shaderobj -output-using-type //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type @@ -36,22 +35,22 @@ void computeMain(uint groupIndex : SV_GroupIndex) float val = 0.0f; // InterlockedAdd - //MTL: atomic_uint threadgroup* {{.*}}shareMemUI + //MTL: atomic_uint threadgroup*{{.*}}shareMemUI //LIB: call {{.*}}.atomic.local.add.u.i32 InterlockedAdd(shareMemUI[idx], uint(1)); val += shareMemUI[idx]; - //MTL: atomic_int threadgroup* {{.*}}shareMemI + //MTL: atomic_int threadgroup*{{.*}}shareMemI //LIB: call {{.*}}.atomic.local.add.s.i32 InterlockedAdd(shareMemI[idx], 2); val += shareMemI[idx]; - //MTL: atomic_uint device* {{.*}}uintBuffer + //MTL: atomic_uint device*{{.*}}uintBuffer //LIB: call {{.*}}.atomic.global.add.u.i32 InterlockedAdd(uintBuffer[idx], 1); val += uintBuffer[idx]; - //MTL: atomic_int device* {{.*}}intBuffer + //MTL: atomic_int device*{{.*}}intBuffer //LIB: call {{.*}}.atomic.global.add.s.i32 InterlockedAdd(intBuffer[idx], 2); val += intBuffer[idx]; diff --git a/tests/metal/atomic-texture-buffer.slang b/tests/metal/atomic-texture-buffer.slang index 3e4eda94b6..1db1563646 100644 --- a/tests/metal/atomic-texture-buffer.slang +++ b/tests/metal/atomic-texture-buffer.slang @@ -2,7 +2,7 @@ //TEST:SIMPLE(filecheck=METAL_FLOAT): -target metal -stage compute -entry computeMain -DFLOAT //TEST:SIMPLE(filecheck=METALLIB): -target metallib -stage compute -entry computeMain -// METAL_FLOAT: 'float' atomic texture operations are disallowed with Metal target's +// METAL_FLOAT: floating point atomic operation //METALLIB: @computeMain diff --git 
a/tests/slang-extension/atomic-int64-byte-address-buffer.slang b/tests/slang-extension/atomic-int64-byte-address-buffer.slang index 9a7ae3b61b..61e38069de 100644 --- a/tests/slang-extension/atomic-int64-byte-address-buffer.slang +++ b/tests/slang-extension/atomic-int64-byte-address-buffer.slang @@ -5,7 +5,7 @@ // No support for int64_t on fxc - we need SM6.0 and dxil // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj -//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj diff --git a/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang b/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang index 4ab67df8e5..2fce9788ad 100644 --- a/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang +++ b/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang @@ -5,7 +5,7 @@ // No support for int64_t on fxc - we need SM6.0 and dxil // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj -//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj // For some reason this doesn't work correctly on CUDA? That it behaves as if always does Min. 
Min and Max do appropriate // things tho, because if I force the condition I do get the right answer diff --git a/tests/slang-extension/cas-int64-byte-address-buffer.slang b/tests/slang-extension/cas-int64-byte-address-buffer.slang index 873f6ab4ba..2d31892155 100644 --- a/tests/slang-extension/cas-int64-byte-address-buffer.slang +++ b/tests/slang-extension/cas-int64-byte-address-buffer.slang @@ -5,7 +5,7 @@ // No support for int64_t on fxc - we need SM6.0 and dxil // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj -//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj diff --git a/tests/slang-extension/exchange-int64-byte-address-buffer.slang b/tests/slang-extension/exchange-int64-byte-address-buffer.slang index 84654ab803..a6c1277ac5 100644 --- a/tests/slang-extension/exchange-int64-byte-address-buffer.slang +++ b/tests/slang-extension/exchange-int64-byte-address-buffer.slang @@ -2,10 +2,7 @@ //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj // No support for int64_t on DX11 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -// No support for int64_t on fxc - we need SM6.0 and dxil -// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12 -//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj -//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj diff --git a/tests/spirv/ref-this.slang b/tests/spirv/ref-this.slang index 5eaa7f3a14..de42639759 100644 --- a/tests/spirv/ref-this.slang +++ b/tests/spirv/ref-this.slang @@ -1,7 +1,7 @@ //TEST:SIMPLE(filecheck=CHECK): -target spirv // CHECK: %[[PTR:[0-9a-zA-Z_]+]] = OpAccessChain %_ptr_PhysicalStorageBuffer_uint %{{.*}} %int_0 -// CHECK: %original = OpAtomicIAdd %uint %[[PTR]] %uint_1 %uint_0 %uint_1 +// CHECK: %{{.*}} = OpAtomicIAdd %uint %[[PTR]] %uint_1 %uint_0 %uint_1 struct Buf {