@@ -141,13 +141,16 @@ BITCAL_FORCEINLINE void shift_left_256(uint64_t* data, int count) noexcept {
141141 hi = vsetq_lane_u64 (carry, hi, 0 );
142142
143143 if (count > 64 ) {
144- count -= 64 ;
145- lo = vshlq_n_u64 (lo, count);
146-
144+ int shift = count - 64 ;
145+ uint64_t lo_lo_val = vgetq_lane_u64 (lo, 0 );
146+ uint64_t lo_hi_val = vgetq_lane_u64 (lo, 1 );
147+ lo = vsetq_lane_u64 (lo_lo_val << shift, lo, 0 );
148+ lo = vsetq_lane_u64 (lo_hi_val << shift, lo, 1 );
149+
147150 uint64_t hi_lo_val = vgetq_lane_u64 (hi, 0 );
148151 uint64_t hi_hi_val = vgetq_lane_u64 (hi, 1 );
149- uint64_t new_hi_lo = (hi_lo_val << count );
150- uint64_t new_hi_hi = (hi_hi_val << count ) | (hi_lo_val >> (64 - count ));
152+ uint64_t new_hi_lo = (hi_lo_val << shift );
153+ uint64_t new_hi_hi = (hi_hi_val << shift ) | (hi_lo_val >> (64 - shift ));
151154 hi = vsetq_lane_u64 (new_hi_lo, hi, 0 );
152155 hi = vsetq_lane_u64 (new_hi_hi, hi, 1 );
153156 }
@@ -192,13 +195,16 @@ BITCAL_FORCEINLINE void shift_right_256(uint64_t* data, int count) noexcept {
192195 lo = vsetq_lane_u64 (vgetq_lane_u64 (lo, 1 ), lo, 0 );
193196
194197 if (count > 64 ) {
195- count -= 64 ;
196- hi = vshrq_n_u64 (hi, count);
197-
198+ int shift = count - 64 ;
199+ uint64_t hi_lo_val = vgetq_lane_u64 (hi, 0 );
200+ uint64_t hi_hi_val = vgetq_lane_u64 (hi, 1 );
201+ hi = vsetq_lane_u64 (hi_lo_val >> shift, hi, 0 );
202+ hi = vsetq_lane_u64 (hi_hi_val >> shift, hi, 1 );
203+
198204 uint64_t lo_lo_val = vgetq_lane_u64 (lo, 0 );
199205 uint64_t lo_hi_val = vgetq_lane_u64 (lo, 1 );
200- lo = vsetq_lane_u64 ((lo_lo_val >> count ) | (lo_hi_val << (64 - count )), lo, 0 );
201- lo = vsetq_lane_u64 (lo_hi_val >> count , lo, 1 );
206+ lo = vsetq_lane_u64 ((lo_lo_val >> shift ) | (lo_hi_val << (64 - shift )), lo, 0 );
207+ lo = vsetq_lane_u64 (lo_hi_val >> shift , lo, 1 );
202208 }
203209 } else if (count > 0 ) {
204210 uint64_t lo_lo = vgetq_lane_u64 (lo, 0 );
@@ -451,26 +457,29 @@ BITCAL_FORCEINLINE void bit_andnot_512(const uint64_t* a, const uint64_t* b, uin
451457
452458BITCAL_FORCEINLINE bool is_zero_128 (const uint64_t * data) noexcept {
453459 uint64x2_t v = load (data);
454- // vminvq_u64 returns the minimum value across all lanes
455- // If all lanes are zero, the minimum is zero
456- // If any lane is non-zero, the minimum will be non-zero (since values are unsigned)
457- return vminvq_u64 (v) == 0 ;
460+ uint64_t lo = vgetq_lane_u64 (v, 0 );
461+ uint64_t hi = vgetq_lane_u64 (v, 1 );
462+ return (lo | hi) == 0 ;
458463}
459464
460465BITCAL_FORCEINLINE bool is_zero_256 (const uint64_t * data) noexcept {
461466 uint64x2_t v0 = load (data);
462467 uint64x2_t v1 = load (data + 2 );
463468 // OR the two vectors together, then check if result is zero
464469 uint64x2_t combined = vorrq_u64 (v0, v1);
465- return vminvq_u64 (combined) == 0 ;
470+ uint64_t lo = vgetq_lane_u64 (combined, 0 );
471+ uint64_t hi = vgetq_lane_u64 (combined, 1 );
472+ return (lo | hi) == 0 ;
466473}
467474
468475BITCAL_FORCEINLINE bool is_zero_512 (const uint64_t * data) noexcept {
469476 uint64x2_t combined = vdupq_n_u64 (0 );
470477 for (int i = 0 ; i < 4 ; ++i) {
471478 combined = vorrq_u64 (combined, load (data + i * 2 ));
472479 }
473- return vminvq_u64 (combined) == 0 ;
480+ uint64_t lo = vgetq_lane_u64 (combined, 0 );
481+ uint64_t hi = vgetq_lane_u64 (combined, 1 );
482+ return (lo | hi) == 0 ;
474483}
475484
476485}
0 commit comments