Fix cast on constant<T>

stijnh · stijnh · commit 823a8cfdfd58 · 2026-04-10T17:04:30.000+02:00
diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h
@@ -98,6 +98,20 @@ struct cast<constant<T>, R, m> {
         return cast<T, R, m> {}(input);
     }
 };
+
+template<typename T>
+struct cast<constant<T>, float> {
+    KERNEL_FLOAT_INLINE float operator()(const T& input) noexcept {
+        return cast<T, float> {}(input);
+    }
+};
+
+template<typename T, RoundingMode m>
+struct cast<constant<T>, float, m> {
+    KERNEL_FLOAT_INLINE float operator()(const T& input) noexcept {
+        return cast<T, float, m> {}(input);
+    }
+};
 }  // namespace ops
 
 #define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP)                                                    \
@@ -140,6 +154,13 @@ KERNEL_FLOAT_CONSTANT_DEFINE_OP(*)
 KERNEL_FLOAT_CONSTANT_DEFINE_OP(/)
 KERNEL_FLOAT_CONSTANT_DEFINE_OP(%)
 
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(==)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(!=)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(<=)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(>=)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(<)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(>)
+
 }  // namespace kernel_float
 
 #endif
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2026-04-09 10:28:01.035452
-// git hash: d4ea7202dd88aa23b79653ba45ffca3162e213bc
+// date: 2026-04-10 17:02:33.335438
+// git hash: 8f363a2146aff48ac4afc71f2283d91a6f1f65dd
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -2225,6 +2225,20 @@ struct cast<constant<T>, R, m> {
         return cast<T, R, m> {}(input);
     }
 };
+
+template<typename T>
+struct cast<constant<T>, float> {
+    KERNEL_FLOAT_INLINE float operator()(const T& input) noexcept {
+        return cast<T, float> {}(input);
+    }
+};
+
+template<typename T, RoundingMode m>
+struct cast<constant<T>, float, m> {
+    KERNEL_FLOAT_INLINE float operator()(const T& input) noexcept {
+        return cast<T, float, m> {}(input);
+    }
+};
 }  // namespace ops
 
 #define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP)                                                    \
@@ -2267,6 +2281,13 @@ KERNEL_FLOAT_CONSTANT_DEFINE_OP(*)
 KERNEL_FLOAT_CONSTANT_DEFINE_OP(/)
 KERNEL_FLOAT_CONSTANT_DEFINE_OP(%)
 
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(==)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(!=)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(<=)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(>=)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(<)
+KERNEL_FLOAT_CONSTANT_DEFINE_OP(>)
+
 }  // namespace kernel_float
 
 #endif
diff --git a/tests/common.h b/tests/common.h
@@ -351,7 +351,7 @@ struct device_runner {
     template<typename T, size_t N>
     void run() {
         if (cudaSetDevice(0) != cudaSuccess) {
-            FAIL("failed to initialize CUDA device, does this machine have a GPU?");
+            FAIL("failed to initialize CUDA device, run with '~[GPU]' to skip GPU tests");
         }
 
         for (int seed = 0; seed < 5; seed++) {
diff --git a/tests/constant.cu b/tests/constant.cu
@@ -1,8 +1,6 @@
 #include "common.h"
 
-#define ASSERT_TYPE(A, B) ASSERT(std::is_same<decltype(A), B>::value);
-
-struct constant_tests {
+struct constant_ops_tests {
     template<typename T>
     __host__ __device__ void operator()(generator<T> gen) {
         T value = gen.next();
@@ -33,8 +31,32 @@ struct constant_tests {
         //        ASSERT_EQ(value % kf::make_constant(5.0), value % T(5));
         //        ASSERT_EQ(kf::make_constant(5.0) % vector, T(5) % vector);
         //        ASSERT_EQ(vector % kf::make_constant(5.0), vector % T(5));
+
+        ASSERT_EQ(kf::cast<double>(kf::make_constant(T(5.0))), kf::make_vec(5.0));
+        ASSERT_EQ(kf::cast<float>(kf::make_constant(T(5.0))), kf::make_vec(5.0f));
+        ASSERT_EQ(kf::cast<int>(kf::make_constant(T(5.0))), kf::make_vec(5));
+
+        ASSERT_EQ(kf::cast<T>(kf::make_constant(double(5.0))), kf::make_vec(T(5.0)));
+        ASSERT_EQ(kf::cast<T>(kf::make_constant(float(5.0))), kf::make_vec(T(5.0)));
+        ASSERT_EQ(kf::cast<T>(kf::make_constant(int(5.0))), kf::make_vec(T(5.0)));
+    }
+};
+
+REGISTER_TEST_CASE("constant ops tests", constant_ops_tests, int, float, double)
+REGISTER_TEST_CASE_GPU("constant ops tests", constant_ops_tests, __half, __nv_bfloat16)
+
+struct constant_eq_tests {
+    template<typename T>
+    __host__ __device__ void operator()(generator<T> gen) {
+        ASSERT(kf::make_constant(T(5.0)) == double(5.0));
+        ASSERT(kf::make_constant(T(5.0)) == float(5.0));
+        ASSERT(kf::make_constant(T(5.0)) == int(5.0));
+
+        ASSERT(kf::make_constant(double(5.0)) == T(5.0));
+        ASSERT(kf::make_constant(float(5.0)) == T(5.0));
+        ASSERT(kf::make_constant(int(5.0)) == T(5.0));
     }
 };
 
-REGISTER_TEST_CASE("constant tests", constant_tests, int, float, double)
-REGISTER_TEST_CASE_GPU("constant tests", constant_tests, __half, __nv_bfloat16)
+REGISTER_TEST_CASE("constant eq tests", constant_eq_tests, int, float, double)
+REGISTER_TEST_CASE_GPU("constant eq tests", constant_eq_tests, __half, __nv_bfloat16)
diff --git a/tests/unops.cu b/tests/unops.cu
@@ -8,13 +8,13 @@ struct unops_tests {
         kf::vec<T, N> b;
 
         b = -a;
-        ASSERT(equals(b[I], T(-items[I])) && ...);
+        ASSERT_EQ_ALL(b[I], T(-items[I]));
 
         b = ~a;
-        ASSERT(equals(b[I], T(~items[I])) && ...);
+        ASSERT_EQ_ALL(b[I], T(~items[I]));
 
         b = !a;
-        ASSERT(equals(b[I], T(!items[I])) && ...);
+        ASSERT_EQ_ALL(b[I], T(!items[I]));
     }
 };
 
@@ -28,58 +28,59 @@ struct unops_float_tests {
         kf::vec<T, N> b;
 
         b = -a;
-        ASSERT(equals(b[I], T(-items[I])) && ...);
+        ASSERT_EQ_ALL(b[I], T(-items[I]));
 
         b = !a;
-        ASSERT(equals(b[I], items[I] == 0.0 ? T(1.0) : T(0.0)) && ...);
+        ASSERT_EQ_ALL(b[I], items[I] == 0.0 ? T(1.0) : T(0.0));
 
         // Ideally, we would test all unary operators, but that would be a lot of work and not that useful since
         // all operators are generated by the same macro. Instead, we only check a few of them
         if constexpr (is_one_of<T, __half, __nv_bfloat16>) {
             // operations on 16-bit numbers are only supported in CC >= 8
 #if KERNEL_FLOAT_CUDA_ARCH >= 800
             b = sqrt(a);
-            ASSERT(equals(b[I], hsqrt(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hsqrt(T(items[I])));
 
             b = sin(a);
-            ASSERT(equals(b[I], hsin(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hsin(T(items[I])));
 
             b = cos(a);
-            ASSERT(equals(b[I], hcos(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hcos(T(items[I])));
 
             b = log(a);
-            ASSERT(equals(b[I], hlog(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hlog(T(items[I])));
 
             b = exp(a);
-            ASSERT(equals(b[I], hexp(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hexp(T(items[I])));
 
             b = rcp(a);
-            ASSERT(equals(b[I], hrcp(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hrcp(T(items[I])));
 
             b = rsqrt(a);
-            ASSERT(equals(b[I], hrsqrt(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], hrsqrt(T(items[I])));
 #endif
         } else {
             b = sqrt(a);
-            ASSERT(equals(b[I], sqrt(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], sqrt(T(items[I])));
 
             b = sin(a);
-            ASSERT(equals(b[I], sin(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], sin(T(items[I])));
 
             b = cos(a);
-            ASSERT(equals(b[I], cos(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], cos(T(items[I])));
 
             b = log(a);
-            ASSERT(equals(b[I], log(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], log(T(items[I])));
 
             b = exp(a);
-            ASSERT(equals(b[I], exp(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], exp(T(items[I])));
 
             b = rcp(a);
-            ASSERT(equals(b[I], rcp(T(items[I]))) && ...);
+            ASSERT_EQ_ALL(b[I], T(1.0 / items[I]));
 
+            // rsqrt does not seem to match bitwise on the GPU, so compare approximately instead of exactly
             b = rsqrt(a);
-            ASSERT(equals(b[I], rsqrt(T(items[I]))) && ...);
+            ASSERT_APPROX_ALL(b[I], rsqrt(T(items[I])));
         }
     }
 };

Original file line number	Diff line number	Diff line change
`@@ -351,7 +351,7 @@ struct device_runner {`
`351`	`351`	`template<typename T, size_t N>`
`352`	`352`	`void run() {`
`353`	`353`	`if (cudaSetDevice(0) != cudaSuccess) {`
`354`		`- FAIL("failed to initialize CUDA device, does this machine have a GPU?");`
	`354`	`+ FAIL("failed to initialize CUDA device, run with '~[GPU]' to skip GPU tests");`
`355`	`355`	`}`
`356`	`356`
`357`	`357`	`for (int seed = 0; seed < 5; seed++) {`