Skip to content

Commit e10da06

Browse files
committed
Add LSX and LASX implementations
1 parent a02d0ca commit e10da06

10 files changed

Lines changed: 799 additions & 1 deletion

File tree

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ aarch64_neon = []
3838
aarch64_neon_prefetch = []
3939

4040
armv7_neon = []
41+
loongarch64_lsx = []
4142

4243
# make the portable SIMD public implementation available (experimental, nightly only)
4344
portable_public_imp = ["public_imp"]

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ This library has been thoroughly tested with sample data as well as fuzzing and
1818
* ARM64 (aarch64) SIMD is supported since Rust 1.61
1919
* WASM (wasm32) SIMD is supported
2020
* 🆕 armv7 NEON support with the `armv7_neon` feature on nightly Rust
21+
* 🆕 loongarch64 LSX/LASX support with the `loongarch64_lsx` feature on nightly Rust
2122
* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII
2223
* aarch64: Up to eleven times faster than the std library on valid non-ASCII, up to four times faster on ASCII (Apple Silicon)
2324
* Faster than the original simdjson implementation
@@ -92,6 +93,11 @@ runtime using the `std::arch::is_arm_feature_detected!` macro unless the CPU tar
9293
`RUSTFLAGS="-C target-feature=+neon"`. Some targets such as `thumbv7neon-linux-androideabi` and `thumbv7neon-unknown-linux-gnueabihf`
9394
have NEON enabled by default.
9495

96+
### LoongArch64
97+
Requires a recent nightly Rust compiler. The `loongarch64_lsx` feature needs to be enabled.
98+
The fastest implementation is usually selected at runtime. Compiling with the `lsx` or `lasx` target features enabled
99+
selects the LSX/LASX implementations at compile time.
100+
95101
### WASM32
96102
For wasm32 support, the implementation is selected at compile time based on the presence of the `simd128` target feature.
97103
Use `RUSTFLAGS="-C target-feature=+simd128"` to enable the WASM SIMD implementation. WASM, at

bench/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ simdutf8_wasmtime = ["wasmtime"]
2929
[dependencies]
3030
core_affinity = "0.8.1"
3131
criterion = "0.8.1"
32-
simdutf8 = { version = "*", path = "..", features = ["aarch64_neon"] }
32+
simdutf8 = { version = "*", path = "..", features = ["aarch64_neon", "loongarch64_lsx"] }
3333
simdjson-utf8 = { version = "*", path = "simdjson-utf8", optional = true }
3434
# default is cranelift which is not as performant as the llvm backend
3535
wasmer = { version = "2.1", optional = true, default-features = false }

src/basic.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,30 @@ pub mod imp {
260260
}
261261
}
262262

263+
/// Includes the loongarch64 SIMD implementations.
264+
#[cfg(all(feature = "loongarch64_lsx", target_arch = "loongarch64"))]
265+
pub mod loongarch64 {
266+
/// Includes the LASX-based validation implementation for loongarch64 CPUs.
267+
///
268+
/// Using the provided functionality on CPUs which do not support LASX is undefined
269+
/// behavior and will very likely cause a crash.
270+
pub mod lasx {
271+
pub use crate::implementation::loongarch64::lasx::validate_utf8_basic as validate_utf8;
272+
pub use crate::implementation::loongarch64::lasx::ChunkedUtf8ValidatorImp;
273+
pub use crate::implementation::loongarch64::lasx::Utf8ValidatorImp;
274+
}
275+
276+
/// Includes the LSX-based validation implementation for loongarch64 CPUs.
277+
///
278+
/// Using the provided functionality on CPUs which do not support LSX is undefined
279+
/// behavior and will very likely cause a crash.
280+
pub mod lsx {
281+
pub use crate::implementation::loongarch64::lsx::validate_utf8_basic as validate_utf8;
282+
pub use crate::implementation::loongarch64::lsx::ChunkedUtf8ValidatorImp;
283+
pub use crate::implementation::loongarch64::lsx::Utf8ValidatorImp;
284+
}
285+
}
286+
263287
/// Includes the wasm32 SIMD implementations.
264288
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
265289
pub mod wasm32 {

src/compat.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,26 @@ pub mod imp {
146146
}
147147
}
148148

149+
/// Includes the loongarch64 SIMD implementations (LSX and LASX).
150+
#[cfg(all(feature = "loongarch64_lsx", target_arch = "loongarch64"))]
151+
pub mod loongarch64 {
152+
/// Includes the LASX-based validation implementation for loongarch64 LASX-compatible CPUs.
153+
///
154+
/// Using the provided functionality on CPUs which do not support LASX is undefined
155+
/// behavior and will very likely cause a crash.
156+
pub mod lasx {
157+
pub use crate::implementations::loongarch64::lasx::validate_utf8_compat as validate_utf8;
158+
}
159+
160+
/// Includes the LSX-based validation implementation for loongarch64 LSX-compatible CPUs.
161+
///
162+
/// Using the provided functionality on CPUs which do not support LSX is undefined
163+
/// behavior and will very likely cause a crash.
164+
pub mod lsx {
165+
pub use crate::implementations::loongarch64::lsx::validate_utf8_compat as validate_utf8;
166+
}
167+
}
168+
149169
/// Includes the wasm32 SIMD implementations.
150170
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
151171
pub mod wasm32 {
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
//! Contains the loongarch64 LASX UTF-8 validation implementation.
2+
3+
#[cfg(target_arch = "loongarch64")]
4+
use core::arch::loongarch64::{
5+
lasx_xvand_v, lasx_xvld, lasx_xvldi, lasx_xvmskltz_b, lasx_xvmsknz_b, lasx_xvor_v,
6+
lasx_xvpermi_q, lasx_xvpickve2gr_d, lasx_xvreplgr2vr_b, lasx_xvshuf_b, lasx_xvsrli_b,
7+
lasx_xvssub_bu, lasx_xvxor_v, m256i,
8+
};
9+
10+
use crate::implementation::helpers::Utf8CheckAlgorithm;
11+
12+
// LASX SIMD primitives
13+
14+
type SimdU8Value = crate::implementation::helpers::SimdU8Value<m256i>;
15+
16+
impl SimdU8Value {
17+
#[flexpect::e(clippy::too_many_arguments)]
18+
#[flexpect::e(clippy::cast_possible_wrap)]
19+
#[target_feature(enable = "lasx")]
20+
#[inline]
21+
unsafe fn from_32_cut_off_leading(
22+
v0: u8,
23+
v1: u8,
24+
v2: u8,
25+
v3: u8,
26+
v4: u8,
27+
v5: u8,
28+
v6: u8,
29+
v7: u8,
30+
v8: u8,
31+
v9: u8,
32+
v10: u8,
33+
v11: u8,
34+
v12: u8,
35+
v13: u8,
36+
v14: u8,
37+
v15: u8,
38+
v16: u8,
39+
v17: u8,
40+
v18: u8,
41+
v19: u8,
42+
v20: u8,
43+
v21: u8,
44+
v22: u8,
45+
v23: u8,
46+
v24: u8,
47+
v25: u8,
48+
v26: u8,
49+
v27: u8,
50+
v28: u8,
51+
v29: u8,
52+
v30: u8,
53+
v31: u8,
54+
) -> Self {
55+
let arr: [u8; 32] = [
56+
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
57+
v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
58+
];
59+
Self::from(lasx_xvld::<0>(arr.as_ptr().cast()))
60+
}
61+
62+
#[flexpect::e(clippy::too_many_arguments)]
63+
#[target_feature(enable = "lasx")]
64+
#[inline]
65+
unsafe fn repeat_16(
66+
v0: u8,
67+
v1: u8,
68+
v2: u8,
69+
v3: u8,
70+
v4: u8,
71+
v5: u8,
72+
v6: u8,
73+
v7: u8,
74+
v8: u8,
75+
v9: u8,
76+
v10: u8,
77+
v11: u8,
78+
v12: u8,
79+
v13: u8,
80+
v14: u8,
81+
v15: u8,
82+
) -> Self {
83+
Self::from_32_cut_off_leading(
84+
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v0, v1, v2, v3,
85+
v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
86+
)
87+
}
88+
89+
#[flexpect::e(clippy::cast_ptr_alignment)]
90+
#[target_feature(enable = "lasx")]
91+
#[inline]
92+
unsafe fn load_from(ptr: *const u8) -> Self {
93+
Self::from(lasx_xvld::<0>(ptr.cast()))
94+
}
95+
96+
#[flexpect::e(clippy::too_many_arguments)]
97+
#[target_feature(enable = "lasx")]
98+
#[inline]
99+
unsafe fn lookup_16(
100+
self,
101+
v0: u8,
102+
v1: u8,
103+
v2: u8,
104+
v3: u8,
105+
v4: u8,
106+
v5: u8,
107+
v6: u8,
108+
v7: u8,
109+
v8: u8,
110+
v9: u8,
111+
v10: u8,
112+
v11: u8,
113+
v12: u8,
114+
v13: u8,
115+
v14: u8,
116+
v15: u8,
117+
) -> Self {
118+
let src = Self::repeat_16(
119+
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
120+
)
121+
.0;
122+
123+
Self::from(lasx_xvshuf_b(src, src, self.0))
124+
}
125+
126+
#[flexpect::e(clippy::cast_possible_wrap)]
127+
#[target_feature(enable = "lasx")]
128+
#[inline]
129+
unsafe fn splat(val: u8) -> Self {
130+
Self::from(lasx_xvreplgr2vr_b(val as i32))
131+
}
132+
133+
#[target_feature(enable = "lasx")]
134+
#[inline]
135+
unsafe fn splat0() -> Self {
136+
Self::from(lasx_xvldi::<0>())
137+
}
138+
139+
#[target_feature(enable = "lasx")]
140+
#[inline]
141+
unsafe fn or(self, b: Self) -> Self {
142+
Self::from(lasx_xvor_v(self.0, b.0))
143+
}
144+
145+
#[target_feature(enable = "lasx")]
146+
#[inline]
147+
unsafe fn and(self, b: Self) -> Self {
148+
Self::from(lasx_xvand_v(self.0, b.0))
149+
}
150+
151+
#[target_feature(enable = "lasx")]
152+
#[inline]
153+
unsafe fn xor(self, b: Self) -> Self {
154+
Self::from(lasx_xvxor_v(self.0, b.0))
155+
}
156+
157+
#[target_feature(enable = "lasx")]
158+
#[inline]
159+
unsafe fn saturating_sub(self, b: Self) -> Self {
160+
Self::from(lasx_xvssub_bu(self.0, b.0))
161+
}
162+
163+
// ugly but shr<N> requires const generics
164+
#[target_feature(enable = "lasx")]
165+
#[inline]
166+
unsafe fn shr4(self) -> Self {
167+
Self::from(lasx_xvsrli_b::<4>(self.0))
168+
}
169+
170+
// ugly but prev<N> requires const generics
171+
#[target_feature(enable = "lasx")]
172+
#[inline]
173+
unsafe fn prev1(self, prev: Self) -> Self {
174+
// This lets us end up with [ prev_hi | self_lo ]
175+
let bridge = lasx_xvpermi_q(self.0, prev.0, 0x21);
176+
// It shuffles [ b_lo | a_lo ] | [ b_hi | a_hi ]
177+
// ...aka [ bridge_lo | self_lo ] | [ bridge_hi | self_hi ]
178+
// ...aka [ prev_hi | self_lo ] | [ self_lo | self_hi ]
179+
let mask = [
180+
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 15, 16, 17, 18, 19, 20,
181+
21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
182+
];
183+
Self::from(lasx_xvshuf_b(self.0, bridge, lasx_xvld::<0>(mask.as_ptr())))
184+
}
185+
186+
// ugly but prev<N> requires const generics
187+
#[target_feature(enable = "lasx")]
188+
#[inline]
189+
unsafe fn prev2(self, prev: Self) -> Self {
190+
let bridge = lasx_xvpermi_q(self.0, prev.0, 0x21);
191+
let mask = [
192+
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 14, 15, 16, 17, 18, 19,
193+
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
194+
];
195+
Self::from(lasx_xvshuf_b(self.0, bridge, lasx_xvld::<0>(mask.as_ptr())))
196+
}
197+
198+
// ugly but prev<N> requires const generics
199+
#[target_feature(enable = "lasx")]
200+
#[inline]
201+
unsafe fn prev3(self, prev: Self) -> Self {
202+
let bridge = lasx_xvpermi_q(self.0, prev.0, 0x21);
203+
let mask = [
204+
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 13, 14, 15, 16, 17, 18,
205+
19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
206+
];
207+
Self::from(lasx_xvshuf_b(self.0, bridge, lasx_xvld::<0>(mask.as_ptr())))
208+
}
209+
210+
#[target_feature(enable = "lasx")]
211+
#[inline]
212+
unsafe fn any_bit_set(self) -> bool {
213+
let nonzero_mask = lasx_xvmsknz_b(self.0);
214+
let lo = lasx_xvpickve2gr_d::<0>(nonzero_mask);
215+
let hi = lasx_xvpickve2gr_d::<2>(nonzero_mask);
216+
lo != 0 || hi != 0
217+
}
218+
219+
#[target_feature(enable = "lasx")]
220+
#[inline]
221+
unsafe fn is_ascii(self) -> bool {
222+
let high_bits = lasx_xvmskltz_b(self.0);
223+
let lo = lasx_xvpickve2gr_d::<0>(high_bits);
224+
let hi = lasx_xvpickve2gr_d::<2>(high_bits);
225+
(lo | hi) == 0
226+
}
227+
}
228+
229+
impl From<m256i> for SimdU8Value {
230+
#[inline]
231+
fn from(val: m256i) -> Self {
232+
Self(val)
233+
}
234+
}
235+
236+
impl Utf8CheckAlgorithm<SimdU8Value> {
237+
#[target_feature(enable = "lasx")]
238+
#[inline]
239+
unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
240+
let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0xe0 - 0x80));
241+
let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0xf0 - 0x80));
242+
is_third_byte.or(is_fourth_byte)
243+
}
244+
}
245+
246+
#[inline]
247+
unsafe fn simd_prefetch(_ptr: *const u8) {}
248+
249+
const PREFETCH: bool = false;
250+
use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
251+
simd_input_256_bit!(#[target_feature(enable = "lasx")]);
252+
algorithm_simd!(#[target_feature(enable = "lasx")]);

0 commit comments

Comments
 (0)