diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index bb67c1abf2a62..35c7e731b7d11 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -218,6 +218,12 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "") BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "") BUILTIN(__nvvm_cos_approx_f, "ff", "") +// Tanh + +TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) + // Fma BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "") diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index 07542feb9d814..dafccbf00bb19 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -78,6 +78,9 @@ class TargetOptions { /// \brief If enabled, use precise square root bool NVVMCudaPrecSqrt = false; + /// \brief If enabled, use approximate tanh + bool NVVMCudaApproxTanhf = false; + /// \brief If enabled, allow AMDGPU unsafe floating point atomics. bool AllowAMDGPUUnsafeFPAtomics = false; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index ad85abdf1bb9b..e650b342f157d 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4726,7 +4726,10 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group, HelpText<"Control exclusion of " "device libraries from device binary linkage. Valid arguments " "are libc, libm-fp32, libm-fp64, all">; - +defm nvvm_cuda_approx_tanh : BoolFOption<"sycl-cuda-approx-tanh", + TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, + PosFlag=7.5">, + NegFlag>; //===----------------------------------------------------------------------===// // FLangOption + CoreOption + NoXarchOption //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index e049069354838..741d0992a2ee0 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -782,6 +782,12 @@ void CodeGenModule::Release() { getTarget().getTargetOpts().NVVMCudaPrecSqrt); } + if (LangOpts.isSYCL() && getTriple().isNVPTX()) { + getModule().addModuleFlag(llvm::Module::Override, + "nvvm-reflect-approx-tanhf", + getTarget().getTargetOpts().NVVMCudaApproxTanhf); + } + if (LangOpts.EHAsynch) getModule().addModuleFlag(llvm::Module::Warning, "eh-asynch", 1); diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu new file mode 100644 index 0000000000000..ec5c335d047d3 --- /dev/null +++ b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fsycl-cuda-approx-tanh %s -o -| FileCheck --check-prefix=CHECK-ON %s +// RUN: %clang_cc1 -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s + +#include "Inputs/cuda.h" + +// Check that the -fsycl-cuda-approx-tanh flag correctly sets the nvvm-reflect module flags. + +extern "C" __device__ void foo() {} + +// CHECK-ON: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 1} +// CHECK-OFF: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 0} diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h index d4167a8e4529e..b53043aa19667 100644 --- a/libclc/generic/include/clcmacro.h +++ b/libclc/generic/include/clcmacro.h @@ -9,11 +9,7 @@ #ifndef __CLC_MACRO_H #define __CLC_MACRO_H -#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ - } \ - \ +#define _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ } \ @@ -30,12 +26,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ } -#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ +#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ + return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ } \ - \ + _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) + +#define _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ FUNCTION(x.z, y.z)); \ @@ -53,6 +51,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ } +#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + } \ + _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) + #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ @@ -107,6 +113,15 @@ FUNCTION(x.hi, y.hi, z.hi)); \ } +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ + ARG3_TYPE##2 z) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + } \ + _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) + #define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \ diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl index f443c36c04117..9a36fc24036f1 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl @@ -11,7 +11,45 @@ #include "../../include/libdevice.h" #include -#define __CLC_FUNCTION __spirv_ocl_tanh -#define __CLC_BUILTIN __nv_tanh -#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) -#include +extern int __clc_nvvm_reflect_arch(); +extern int __clc_nvvm_reflect_approx_tanh(); + +#define __USE_TANH_APPROX \ + (__clc_nvvm_reflect_approx_tanh() && (__clc_nvvm_reflect_arch() >= 750)) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __spirv_ocl_tanh(double x) { + return __nv_tanh(x); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_tanh, double) + +#endif + +_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_tanh(float x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f(x) : __nv_tanhf(x); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_tanh, float) + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_tanh(half x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16(x) : __nv_tanhf(x); +} + +_CLC_DEF _CLC_OVERLOAD half2 __spirv_ocl_tanh(half2 x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16x2(x) + : (half2)(__nv_tanhf(x.x), __nv_tanhf(x.y)); +} + +_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_tanh, half) + +#endif + +#undef __USE_TANH_APPROX diff --git a/libclc/ptx-nvidiacl/libspirv/reflect.ll b/libclc/ptx-nvidiacl/libspirv/reflect.ll index 91ae4135644d5..978489adaf191 100755 --- a/libclc/ptx-nvidiacl/libspirv/reflect.ll +++ b/libclc/ptx-nvidiacl/libspirv/reflect.ll @@ -6,3 +6,10 @@ define i32 @__clc_nvvm_reflect_arch() alwaysinline { %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*)) ret i32 %reflect } + +@str_approx_tanh = private addrspace(1) constant [20 x i8] c"__CUDA_APPROX_TANHF\00" + +define i32 @__clc_nvvm_reflect_approx_tanh() alwaysinline { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([20 x i8], [20 x i8] addrspace(1)* @str_approx_tanh, i32 0, i32 0) to i8*)) + ret i32 %reflect +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index fb7598d6ac75d..8e5ad53fc2987 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -808,6 +808,17 @@ let TargetPrefix = "nvvm" in { def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; +// +// Tanh +// + + def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_nvvm_tanh_approx_f16 : GCCBuiltin<"__nvvm_tanh_approx_f16">, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>; + def int_nvvm_tanh_approx_f16x2 : GCCBuiltin<"__nvvm_tanh_approx_f16x2">, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>; + // // Fma // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d7c8acd146082..f0bacf9dadce7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -784,6 +784,17 @@ def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;", def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_cos_approx_f>; +// +// Tanh +// + +def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>; +def INT_NVVM_TANH_APPROX_F16 : F_MATH_1<"tanh.approx.f16 \t$dst, $src0;", + Float16Regs, Float16Regs, int_nvvm_tanh_approx_f16>; +def INT_NVVM_TANH_APPROX_F16X2 : F_MATH_1<"tanh.approx.f16x2 \t$dst, $src0;", + Float16x2Regs, Float16x2Regs, int_nvvm_tanh_approx_f16x2>; + // // Fma // diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 6ffc49a59a551..396fa18805202 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -176,6 +176,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { if (auto *Flag = mdconst::extract_or_null( F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt"))) ReflectVal = Flag->getSExtValue(); + } else if (ReflectArg == "__CUDA_APPROX_TANHF") { + // Try to pull __CUDA_APPROX_TANHF from the nvvm-reflect-approx-tanhf + // module flag. + if (auto *Flag = mdconst::extract_or_null( + F.getParent()->getModuleFlag("nvvm-reflect-approx-tanhf"))) + ReflectVal = Flag->getSExtValue(); } Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call);