Skip to content

Commit 8f5e459

Browse files
shijiashuai and qwencoder committed
fix: resolve NEON compilation errors on ARM and macOS
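- Fix the undeclared `vminvq_u64` error: NEON provides no 64-bit across-lanes minimum intrinsic, so the zero checks now extract both lanes and OR them together. (OR is also the correct reduction here: a minimum of zero only shows that at least one lane is zero, not that every lane is.)
- Fix the `vshlq_n_u64` and `vshrq_n_u64` errors: these intrinsics require a compile-time constant shift amount, but `count` is a runtime value, so the shifts are now performed with scalar operations on the individual lanes.
- Affects: ARM64 cross-compile and macOS ARM64 AppleClang builds.

Resolves CI failures on the Linux ARM64 and macOS ARM64 platforms.

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>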
1 parent f56a21c commit 8f5e459

1 file changed

Lines changed: 25 additions & 16 deletions

File tree

include/bitcal/neon_ops.hpp

Lines changed: 25 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -141,13 +141,16 @@ BITCAL_FORCEINLINE void shift_left_256(uint64_t* data, int count) noexcept {
141141
hi = vsetq_lane_u64(carry, hi, 0);
142142

143143
if (count > 64) {
144-
count -= 64;
145-
lo = vshlq_n_u64(lo, count);
146-
144+
int shift = count - 64;
145+
uint64_t lo_lo_val = vgetq_lane_u64(lo, 0);
146+
uint64_t lo_hi_val = vgetq_lane_u64(lo, 1);
147+
lo = vsetq_lane_u64(lo_lo_val << shift, lo, 0);
148+
lo = vsetq_lane_u64(lo_hi_val << shift, lo, 1);
149+
147150
uint64_t hi_lo_val = vgetq_lane_u64(hi, 0);
148151
uint64_t hi_hi_val = vgetq_lane_u64(hi, 1);
149-
uint64_t new_hi_lo = (hi_lo_val << count);
150-
uint64_t new_hi_hi = (hi_hi_val << count) | (hi_lo_val >> (64 - count));
152+
uint64_t new_hi_lo = (hi_lo_val << shift);
153+
uint64_t new_hi_hi = (hi_hi_val << shift) | (hi_lo_val >> (64 - shift));
151154
hi = vsetq_lane_u64(new_hi_lo, hi, 0);
152155
hi = vsetq_lane_u64(new_hi_hi, hi, 1);
153156
}
@@ -192,13 +195,16 @@ BITCAL_FORCEINLINE void shift_right_256(uint64_t* data, int count) noexcept {
192195
lo = vsetq_lane_u64(vgetq_lane_u64(lo, 1), lo, 0);
193196

194197
if (count > 64) {
195-
count -= 64;
196-
hi = vshrq_n_u64(hi, count);
197-
198+
int shift = count - 64;
199+
uint64_t hi_lo_val = vgetq_lane_u64(hi, 0);
200+
uint64_t hi_hi_val = vgetq_lane_u64(hi, 1);
201+
hi = vsetq_lane_u64(hi_lo_val >> shift, hi, 0);
202+
hi = vsetq_lane_u64(hi_hi_val >> shift, hi, 1);
203+
198204
uint64_t lo_lo_val = vgetq_lane_u64(lo, 0);
199205
uint64_t lo_hi_val = vgetq_lane_u64(lo, 1);
200-
lo = vsetq_lane_u64((lo_lo_val >> count) | (lo_hi_val << (64 - count)), lo, 0);
201-
lo = vsetq_lane_u64(lo_hi_val >> count, lo, 1);
206+
lo = vsetq_lane_u64((lo_lo_val >> shift) | (lo_hi_val << (64 - shift)), lo, 0);
207+
lo = vsetq_lane_u64(lo_hi_val >> shift, lo, 1);
202208
}
203209
} else if (count > 0) {
204210
uint64_t lo_lo = vgetq_lane_u64(lo, 0);
@@ -451,26 +457,29 @@ BITCAL_FORCEINLINE void bit_andnot_512(const uint64_t* a, const uint64_t* b, uin
451457

452458
BITCAL_FORCEINLINE bool is_zero_128(const uint64_t* data) noexcept {
453459
uint64x2_t v = load(data);
454-
// vminvq_u64 returns the minimum value across all lanes
455-
// If all lanes are zero, the minimum is zero
456-
// If any lane is non-zero, the minimum will be non-zero (since values are unsigned)
457-
return vminvq_u64(v) == 0;
460+
uint64_t lo = vgetq_lane_u64(v, 0);
461+
uint64_t hi = vgetq_lane_u64(v, 1);
462+
return (lo | hi) == 0;
458463
}
459464

460465
BITCAL_FORCEINLINE bool is_zero_256(const uint64_t* data) noexcept {
461466
uint64x2_t v0 = load(data);
462467
uint64x2_t v1 = load(data + 2);
463468
// OR the two vectors together, then check if result is zero
464469
uint64x2_t combined = vorrq_u64(v0, v1);
465-
return vminvq_u64(combined) == 0;
470+
uint64_t lo = vgetq_lane_u64(combined, 0);
471+
uint64_t hi = vgetq_lane_u64(combined, 1);
472+
return (lo | hi) == 0;
466473
}
467474

468475
BITCAL_FORCEINLINE bool is_zero_512(const uint64_t* data) noexcept {
469476
uint64x2_t combined = vdupq_n_u64(0);
470477
for (int i = 0; i < 4; ++i) {
471478
combined = vorrq_u64(combined, load(data + i * 2));
472479
}
473-
return vminvq_u64(combined) == 0;
480+
uint64_t lo = vgetq_lane_u64(combined, 0);
481+
uint64_t hi = vgetq_lane_u64(combined, 1);
482+
return (lo | hi) == 0;
474483
}
475484

476485
}

0 commit comments

Comments
 (0)