diff --git a/packages/cubejs-backend-native/src/orchestrator.rs b/packages/cubejs-backend-native/src/orchestrator.rs index 9b41660b2b606..469c0bbbe72a8 100644 --- a/packages/cubejs-backend-native/src/orchestrator.rs +++ b/packages/cubejs-backend-native/src/orchestrator.rs @@ -2,7 +2,8 @@ use crate::node_obj_deserializer::JsValueDeserializer; use crate::transport::MapCubeErrExt; use cubeorchestrator::query_message_parser::QueryResult; use cubeorchestrator::query_result_transform::{ - DBResponsePrimitive, RequestResultData, RequestResultDataMulti, TransformedData, + DBResponsePrimitive, InternedKeyLookup, RequestResultData, RequestResultDataMulti, + TransformedData, }; use cubeorchestrator::transport::{JsRawColumnarData, TransformDataRequest}; use cubesql::compile::engine::df::scan::{ColumnarValueObject, FieldValue, ValueObject}; @@ -212,7 +213,8 @@ impl ValueObject for ResultWrapper { ))); }; - row.get(field_name).unwrap_or(&DBResponsePrimitive::Null) + row.get(&InternedKeyLookup::new(field_name)) + .unwrap_or(&DBResponsePrimitive::Null) } }; diff --git a/packages/cubejs-dremio-driver/driver/DremioQuery.js b/packages/cubejs-dremio-driver/driver/DremioQuery.js index 8913937d47806..eb62241e44a1c 100644 --- a/packages/cubejs-dremio-driver/driver/DremioQuery.js +++ b/packages/cubejs-dremio-driver/driver/DremioQuery.js @@ -165,6 +165,8 @@ class DremioQuery extends BaseQuery { templates.functions.DATEDIFF = 'DATE_DIFF(DATE, DATE_TRUNC(\'{{ date_part }}\', {{ args[1] }}), DATE_TRUNC(\'{{ date_part }}\', {{ args[2] }}))'; templates.functions.STRING_AGG = 'LISTAGG({% if distinct %}DISTINCT {% endif %}{{ args_concat }})'; templates.expressions.interval_single_date_part = 'CAST({{ num }} as INTERVAL {{ date_part }})'; + templates.expressions.like = '{{ expr }} {% if negated %}NOT {% endif %}LIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\'{% endif %}'; + delete templates.expressions.ilike; templates.quotes.identifiers = '"'; return templates; } diff --git 
a/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts b/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts index e101fa200da96..27ed2bb535df2 100644 --- a/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts +++ b/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts @@ -64,6 +64,8 @@ export class DuckDBQuery extends BaseQuery { templates.functions.LEAST = 'LEAST({{ args_concat }})'; templates.functions.GREATEST = 'GREATEST({{ args_concat }})'; templates.functions.STRING_AGG = 'STRING_AGG({% if distinct %}DISTINCT {% endif %}{{ args[0] }}, COALESCE({{ args[1] }}, \'\'))'; + templates.expressions.like = '{{ expr }} {% if negated %}NOT {% endif %}LIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\'{% endif %}'; + templates.expressions.ilike = '{{ expr }} {% if negated %}NOT {% endif %}ILIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\'{% endif %}'; return templates; } diff --git a/packages/cubejs-schema-compiler/src/adapter/BaseFilter.ts b/packages/cubejs-schema-compiler/src/adapter/BaseFilter.ts index f4fea50f91cb2..5dedae94e9afa 100644 --- a/packages/cubejs-schema-compiler/src/adapter/BaseFilter.ts +++ b/packages/cubejs-schema-compiler/src/adapter/BaseFilter.ts @@ -114,7 +114,15 @@ export class BaseFilter extends BaseDimension { } public isWildcardOperator() { - return this.camelizeOperator === 'contains' || this.camelizeOperator === 'notContains'; + // All LIKE-based operators need wildcard chars escaped in user values + return ( + this.camelizeOperator === 'contains' || + this.camelizeOperator === 'notContains' || + this.camelizeOperator === 'startsWith' || + this.camelizeOperator === 'notStartsWith' || + this.camelizeOperator === 'endsWith' || + this.camelizeOperator === 'notEndsWith' + ); } public filterParams() { diff --git a/packages/cubejs-schema-compiler/src/adapter/MssqlQuery.ts b/packages/cubejs-schema-compiler/src/adapter/MssqlQuery.ts index 45399322202ce..f4f33a0ffc07d 100644 --- a/packages/cubejs-schema-compiler/src/adapter/MssqlQuery.ts +++ 
b/packages/cubejs-schema-compiler/src/adapter/MssqlQuery.ts @@ -274,6 +274,7 @@ export class MssqlQuery extends BaseQuery { delete templates.functions.STRING_AGG; // PERCENTILE_CONT works but requires PARTITION BY delete templates.functions.PERCENTILECONT; + templates.expressions.like = '{{ expr }} {% if negated %}NOT {% endif %}LIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\'{% endif %}'; delete templates.expressions.ilike; // MSSQL uses + for string concatenation instead of || templates.expressions.concat_strings = '{{ strings | join(\' + \' ) }}'; diff --git a/packages/cubejs-schema-compiler/src/adapter/PrestodbQuery.ts b/packages/cubejs-schema-compiler/src/adapter/PrestodbQuery.ts index cebb88a6d5749..ef6f1548ff5c6 100644 --- a/packages/cubejs-schema-compiler/src/adapter/PrestodbQuery.ts +++ b/packages/cubejs-schema-compiler/src/adapter/PrestodbQuery.ts @@ -161,6 +161,7 @@ export class PrestodbQuery extends BaseQuery { templates.expressions.binary = '{% if op == \'||\' %}' + '(CAST({{ left }} AS VARCHAR) || CAST({{ right }} AS VARCHAR))' + '{% else %}({{ left }} {{ op }} {{ right }}){% endif %}'; + templates.expressions.like = '{{ expr }} {% if negated %}NOT {% endif %}LIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\'{% endif %}'; delete templates.expressions.ilike; templates.types.string = 'VARCHAR'; templates.types.float = 'REAL'; diff --git a/packages/cubejs-schema-compiler/src/adapter/SnowflakeQuery.ts b/packages/cubejs-schema-compiler/src/adapter/SnowflakeQuery.ts index 3523d623987f0..4267eaa6888b8 100644 --- a/packages/cubejs-schema-compiler/src/adapter/SnowflakeQuery.ts +++ b/packages/cubejs-schema-compiler/src/adapter/SnowflakeQuery.ts @@ -116,6 +116,8 @@ export class SnowflakeQuery extends BaseQuery { templates.expressions.extract = 'EXTRACT({{ date_part }} FROM {{ expr }})'; templates.expressions.interval = 'INTERVAL \'{{ interval }}\''; templates.expressions.timestamp_literal = '\'{{ value }}\'::timestamp_tz'; + 
templates.expressions.like = '{{ expr }} {% if negated %}NOT {% endif %}LIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\\\'{% endif %}'; + templates.expressions.ilike = '{{ expr }} {% if negated %}NOT {% endif %}ILIKE {{ pattern }}{% if default_escape %} ESCAPE \'\\\\\'{% endif %}'; templates.operators.is_not_distinct_from = 'IS NOT DISTINCT FROM'; templates.join_types.full = 'FULL'; delete templates.types.interval; diff --git a/rust/cube/cubeorchestrator/Cargo.toml b/rust/cube/cubeorchestrator/Cargo.toml index 600766a8d2531..e07a074f81ac9 100644 --- a/rust/cube/cubeorchestrator/Cargo.toml +++ b/rust/cube/cubeorchestrator/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] chrono = { version = "0.4.31", features = ["serde"] } cubeshared = { path = "../cubeshared" } -serde = { version = "1.0.217", features = ["derive"] } +serde = { version = "1.0.217", features = ["derive", "rc"] } serde_json = "1.0.133" anyhow = "1.0" itertools = "0.13.0" diff --git a/rust/cube/cubeorchestrator/src/query_result_transform.rs b/rust/cube/cubeorchestrator/src/query_result_transform.rs index 9c08428fe89ab..2bfef54e329d0 100644 --- a/rust/cube/cubeorchestrator/src/query_result_transform.rs +++ b/rust/cube/cubeorchestrator/src/query_result_transform.rs @@ -7,7 +7,7 @@ use crate::{ }; use anyhow::{bail, Context, Result}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; -use indexmap::IndexMap; +use indexmap::{Equivalent, IndexMap}; use itertools::multizip; use serde::{ de::{self, MapAccess, SeqAccess, Visitor}, @@ -15,8 +15,9 @@ use serde::{ }; use serde_json::Value; use std::{ - collections::{HashMap, HashSet}, + collections::{hash_map::DefaultHasher, HashMap, HashSet}, fmt::Display, + hash::{BuildHasher, Hash, Hasher}, sync::{Arc, LazyLock}, }; @@ -40,6 +41,138 @@ pub static GRANULARITY_LEVELS: LazyLock> = LazyLock::n }); const DEFAULT_LEVEL_FOR_UNKNOWN: u8 = 10; +/// IndexMap key whose hash is computed once at construction. 
Combined with +/// [`PrehashedBuildHasher`], this makes per-row `insert` skip the SipHash13 +/// pass over the string bytes — the hasher just stores and returns the +/// pre-computed `u64`. +pub struct InternedKey { + hash: u64, + text: Box, +} + +impl InternedKey { + pub fn new(text: &str) -> Self { + Self { + hash: hash_str(text), + text: text.into(), + } + } + + pub fn as_str(&self) -> &str { + &self.text + } +} + +fn hash_str(s: &str) -> u64 { + let mut h = DefaultHasher::new(); + s.hash(&mut h); + h.finish() +} + +impl Hash for InternedKey { + fn hash(&self, state: &mut H) { + state.write_u64(self.hash); + } +} + +impl PartialEq for InternedKey { + fn eq(&self, other: &Self) -> bool { + self.hash == other.hash && self.text == other.text + } +} +impl Eq for InternedKey {} + +impl std::fmt::Debug for InternedKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.text, f) + } +} + +impl std::fmt::Display for InternedKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.text) + } +} + +impl Serialize for InternedKey { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(&self.text) + } +} + +impl<'de> Deserialize<'de> for InternedKey { + fn deserialize>(deserializer: D) -> Result { + let text: String = String::deserialize(deserializer)?; + Ok(InternedKey::new(&text)) + } +} + +/// Lookup key for `IndexMap, V, PrehashedBuildHasher>` that +/// avoids allocating an `Arc` per lookup when the caller only has +/// a borrowed `&str` (e.g. per-cell `field_name` lookups from the SQL scan +/// path in `cubejs-backend-native`). Computes the hash of the borrowed `&str` +/// once at construction. 
+pub struct InternedKeyLookup<'a> { + hash: u64, + text: &'a str, +} + +impl<'a> InternedKeyLookup<'a> { + pub fn new(text: &'a str) -> Self { + Self { + hash: hash_str(text), + text, + } + } +} + +impl Hash for InternedKeyLookup<'_> { + fn hash(&self, state: &mut H) { + state.write_u64(self.hash); + } +} + +impl Equivalent> for InternedKeyLookup<'_> { + fn equivalent(&self, key: &Arc) -> bool { + self.hash == key.hash && self.text == key.as_str() + } +} + +/// Pass-through [`BuildHasher`] for IndexMaps keyed by [`InternedKey`] / +/// [`InternedKeyLookup`]: takes the `u64` they emit and returns it unchanged. +#[derive(Default, Clone)] +pub struct PrehashedBuildHasher; + +impl BuildHasher for PrehashedBuildHasher { + type Hasher = PrehashedHasher; + + fn build_hasher(&self) -> PrehashedHasher { + PrehashedHasher(0) + } +} + +pub struct PrehashedHasher(u64); + +impl Hasher for PrehashedHasher { + fn finish(&self) -> u64 { + self.0 + } + + fn write(&mut self, _bytes: &[u8]) { + unreachable!("PrehashedHasher only accepts pre-computed u64 hashes via write_u64"); + } + + fn write_u64(&mut self, n: u64) { + self.0 = n; + } +} + +pub type VanillaRow = IndexMap, DBResponsePrimitive, PrehashedBuildHasher>; + +pub fn empty_vanilla_row(capacity: usize) -> VanillaRow { + IndexMap::with_capacity_and_hasher(capacity, PrehashedBuildHasher) +} + /// Transform specified `value` with specified `type` to the network protocol type. pub fn transform_value(value: DBResponsePrimitive, type_: &str) -> DBResponsePrimitive { match value { @@ -418,9 +551,10 @@ pub fn get_compact_row( /// and member-name parsing for every cell. pub struct VanillaColumnPlan<'a> { column_index: usize, - member_name: &'a str, + /// Interned IndexMap key for this column with a pre-computed hash. + /// Cloned via [`Arc::clone`] per row (atomic refcount inc). 
+ key: Arc, member_type: &'a str, - granularity_track: Option>, } pub(crate) struct VanillaGranularityTrack<'a> { @@ -429,9 +563,38 @@ pub(crate) struct VanillaGranularityTrack<'a> { level: u8, } +/// Resolved at plan time: for each deprecated-style base time dimension (one +/// that appears in the query only via `{cube}.{dim}.{granularity}` aliases), +/// the list of source columns whose value can be reused under the bare +/// `{cube}.{dim}` key. Candidates are kept in column-encounter order. At row +/// time we pick the lowest-level candidate whose value is actually present — +/// so a row missing the finest column still falls back to a coarser one, as +/// the previous per-row HashMap did. Ties resolve to the last column. +pub(crate) struct VanillaGranularityExtra { + /// Interned IndexMap key for the bare `{cube}.{dim}` base member. + /// Built once at plan time and cloned via [`Arc::clone`] per row. + base_key: Arc, + candidates: Vec<(u8, Arc)>, +} + pub struct VanillaPlan<'a> { columns: Vec>, - has_granularity_tracking: bool, + minimal_granularity_extras: Vec, + /// Pre-computed tail entry that depends only on the query, not the row. + tail: VanillaTail, +} + +enum VanillaTail { + None, + CompareDateRange { + key: Arc, + value: DBResponsePrimitive, + }, + Blending { + blending_key: Arc, + /// Used only for lookup against the per-row map — never inserted. 
+ response_key: InternedKey, + }, } pub fn build_vanilla_plan<'a>( @@ -439,9 +602,10 @@ pub fn build_vanilla_plan<'a>( alias_to_member_name_map: &'a HashMap, annotation: &'a HashMap, query: &NormalizedQuery, + query_type: &QueryType, ) -> Result> { let mut columns = Vec::with_capacity(columns_pos.len()); - let mut has_granularity_tracking = false; + let mut candidates_for_base: IndexMap<&'a str, Vec<(u8, Arc)>> = IndexMap::new(); for (alias, &index) in columns_pos { let member_name = match alias_to_member_name_map.get(alias) { @@ -452,25 +616,50 @@ pub fn build_vanilla_plan<'a>( let annotation_for_member = annotation.get(member_name).unwrap(); let member_type = annotation_for_member.member_type.as_deref().unwrap_or(""); - // Handle deprecated time dimensions without granularity. - // Try to collect minimal granularity value for time dimensions without granularity - // as there might be more than one granularity column for the same dimension. - let granularity_track = compute_vanilla_granularity_track(member_name, query); - if granularity_track.is_some() { - has_granularity_tracking = true; + let key = Arc::new(InternedKey::new(member_name)); + + if let Some(track) = compute_vanilla_granularity_track(member_name, query) { + candidates_for_base + .entry(track.base_member) + .or_default() + .push((track.level, Arc::clone(&key))); } columns.push(VanillaColumnPlan { column_index: index, - member_name, + key, member_type, - granularity_track, }); } + let minimal_granularity_extras = candidates_for_base + .into_iter() + .map(|(base_member, candidates)| VanillaGranularityExtra { + base_key: Arc::new(InternedKey::new(base_member)), + candidates, + }) + .collect(); + + let tail = match query_type { + QueryType::CompareDateRangeQuery => VanillaTail::CompareDateRange { + key: Arc::new(InternedKey::new(COMPARE_DATE_RANGE_FIELD)), + value: get_date_range_value(query.time_dimensions.as_ref())?, + }, + QueryType::BlendingQuery => VanillaTail::Blending { + blending_key: 
Arc::new(InternedKey::new(&get_blending_query_key( + query.time_dimensions.as_ref(), + )?)), + response_key: InternedKey::new(&get_blending_response_key( + query.time_dimensions.as_ref(), + )?), + }, + _ => VanillaTail::None, + }; + Ok(VanillaPlan { columns, - has_granularity_tracking, + minimal_granularity_extras, + tail, }) } @@ -633,67 +822,63 @@ fn build_columnar_columns( columns } -/// Convert DB response object to the vanilla output format. +/// Convert DB response object to the vanilla output format. Keys are +/// pre-hashed [`InternedKey`] values shared via [`Arc::clone`] from the plan, +/// turning per-cell hashing/key allocation into an atomic refcount inc. pub fn get_vanilla_row( plan: &VanillaPlan<'_>, - query_type: &QueryType, - query: &NormalizedQuery, db_row: &[DBResponsePrimitive], -) -> Result> { +) -> Result { // +1 to cover the optional tail entry (compareDateRange / blending key). - let mut row = IndexMap::with_capacity(plan.columns.len() + 1); - - if plan.has_granularity_tracking { - // FIXME: For now custom granularities are not supported, only common ones. - // There is no granularity type/class implementation in rust yet. 
- let mut minimal_granularities: HashMap<&str, (u8, DBResponsePrimitive)> = HashMap::new(); - - for column in &plan.columns { - if let Some(value) = db_row.get(column.column_index) { - let transformed_value = transform_value(value.clone(), column.member_type); - row.insert(column.member_name.to_string(), transformed_value.clone()); - - if let Some(track) = &column.granularity_track { - match minimal_granularities.get(track.base_member) { - Some((existing_level, _)) if *existing_level < track.level => {} - _ => { - minimal_granularities - .insert(track.base_member, (track.level, transformed_value)); - } - } + let mut row = IndexMap::with_capacity_and_hasher( + plan.columns.len() + plan.minimal_granularity_extras.len() + 1, + PrehashedBuildHasher, + ); + + for column in &plan.columns { + if let Some(value) = db_row.get(column.column_index) { + let transformed_value = transform_value(value.clone(), column.member_type); + row.insert(Arc::clone(&column.key), transformed_value); + } + } + + // Handle deprecated time dimensions without granularity. The candidate + // columns were collected at plan build time; pick the lowest-level one + // whose transformed value is actually present in this row + if !plan.minimal_granularity_extras.is_empty() { + for extra in &plan.minimal_granularity_extras { + let mut best: Option<(u8, &DBResponsePrimitive)> = None; + + for (level, source_key) in &extra.candidates { + let Some(value) = row.get::(source_key) else { + continue; + }; + + match best { + Some((best_level, _)) if best_level < *level => {} + _ => best = Some((*level, value)), } } - } - // Handle deprecated time dimensions without granularity - for (base_member, (_, value)) in minimal_granularities { - row.insert(base_member.to_string(), value); - } - } else { - // Fast path: no column needs granularity bookkeeping. Skip the HashMap - // entirely and move the transformed value straight into the row. 
- for column in &plan.columns { - if let Some(value) = db_row.get(column.column_index) { - let transformed_value = transform_value(value.clone(), column.member_type); - row.insert(column.member_name.to_string(), transformed_value); + if let Some((_, value)) = best { + row.insert(Arc::clone(&extra.base_key), value.clone()); } } } - match query_type { - QueryType::CompareDateRangeQuery => { - let date_range_value = get_date_range_value(query.time_dimensions.as_ref())?; - row.insert("compareDateRange".to_string(), date_range_value); + match &plan.tail { + VanillaTail::None => {} + VanillaTail::CompareDateRange { key, value } => { + row.insert(Arc::clone(key), value.clone()); } - QueryType::BlendingQuery => { - let blending_key = get_blending_query_key(query.time_dimensions.as_ref())?; - let response_key = get_blending_response_key(query.time_dimensions.as_ref())?; - - if let Some(value) = row.get(&response_key) { - row.insert(blending_key, value.clone()); + VanillaTail::Blending { + blending_key, + response_key, + } => { + if let Some(value) = row.get::(response_key) { + row.insert(Arc::clone(blending_key), value.clone()); } } - _ => {} } Ok(row) @@ -807,7 +992,7 @@ pub enum TransformedData { members: Vec, columns: Vec>, }, - Vanilla(Vec>), + Vanilla(Vec), } impl TransformedData { @@ -865,11 +1050,12 @@ impl TransformedData { alias_to_member_name_map, annotation, query, + query_type, )?; let dataset: Vec<_> = cube_store_result .rows .iter() - .map(|row| get_vanilla_row(&plan, query_type, query, row)) + .map(|row| get_vanilla_row(&plan, row)) .collect::>>()?; Ok(TransformedData::Vanilla(dataset)) } @@ -3115,18 +3301,19 @@ mod tests { alias_to_member_name_map, annotation, &query, + query_type, )?; - let res = get_vanilla_row(&plan, query_type, &query, &raw_data.rows[0])?; - let expected = IndexMap::from([ - ( - "ECommerceRecordsUs2021.city".to_string(), - DBResponsePrimitive::String("Missouri City".to_string()), - ), - ( - 
"ECommerceRecordsUs2021.avg_discount".to_string(), - DBResponsePrimitive::String("0.80000000000000000000".to_string()), - ), - ]); + let res = get_vanilla_row(&plan, &raw_data.rows[0])?; + + let mut expected: VanillaRow = empty_vanilla_row(2); + expected.insert( + Arc::new(InternedKey::new("ECommerceRecordsUs2021.city")), + DBResponsePrimitive::String("Missouri City".to_string()), + ); + expected.insert( + Arc::new(InternedKey::new("ECommerceRecordsUs2021.avg_discount")), + DBResponsePrimitive::String("0.80000000000000000000".to_string()), + ); assert_eq!(res, expected); Ok(()) } @@ -3145,12 +3332,14 @@ mod tests { let alias_to_member_name_map = &test_data.request.alias_to_member_name_map; let annotation = &test_data.request.annotation; let query = test_data.request.query.clone(); + let query_type = &test_data.request.query_type.clone().unwrap_or_default(); match build_vanilla_plan( &raw_data.columns_pos, alias_to_member_name_map, annotation, &query, + query_type, ) { Ok(_) => Err(TestError("build_vanilla_plan() should fail ".to_string()).into()), Err(err) => { @@ -3174,12 +3363,14 @@ mod tests { let alias_to_member_name_map = &test_data.request.alias_to_member_name_map; let annotation = &test_data.request.annotation; let query = test_data.request.query.clone(); + let query_type = &test_data.request.query_type.clone().unwrap_or_default(); match build_vanilla_plan( &raw_data.columns_pos, alias_to_member_name_map, annotation, &query, + query_type, ) { Ok(_) => Err(TestError("build_vanilla_plan() should fail ".to_string()).into()), Err(err) => { @@ -3341,4 +3532,124 @@ mod tests { .expect("should produce a track"); assert_eq!(track.base_member, "Cube.orderDate"); } + + fn make_config_item(member_type: &str) -> ConfigItem { + ConfigItem { + title: None, + short_title: None, + description: None, + member_type: Some(member_type.to_string()), + format: None, + currency: None, + meta: None, + drill_members: None, + drill_members_grouped: None, + granularities: None, + 
granularity: None, + } + } + + /// Two granularity columns share the same base time dim. When the finer + /// candidate's value is missing from the row, the bare `{cube}.{dim}` key + /// must fall back to the coarser candidate — same behavior as the previous + /// per-row HashMap, which only considered columns whose value was present. + #[test] + fn test_get_vanilla_row_minimal_granularity_falls_back_when_finer_missing() -> Result<()> { + let mut columns_pos: IndexMap = IndexMap::new(); + columns_pos.insert("t_day".to_string(), 2); // out of range in the row below + columns_pos.insert("t_month".to_string(), 0); + columns_pos.insert("city".to_string(), 1); + + let mut alias_to_member_name_map: HashMap = HashMap::new(); + alias_to_member_name_map.insert("t_day".to_string(), "Cube.t.day".to_string()); + alias_to_member_name_map.insert("t_month".to_string(), "Cube.t.month".to_string()); + alias_to_member_name_map.insert("city".to_string(), "Cube.city".to_string()); + + let mut annotation: HashMap = HashMap::new(); + annotation.insert("Cube.t.day".to_string(), make_config_item("time")); + annotation.insert("Cube.t.month".to_string(), make_config_item("time")); + annotation.insert("Cube.city".to_string(), make_config_item("string")); + + let query = make_query_with_dims(None); + let plan = build_vanilla_plan( + &columns_pos, + &alias_to_member_name_map, + &annotation, + &query, + &QueryType::RegularQuery, + )?; + + // Row only has two cells, so column_index 2 (t_day) yields None. 
+ let db_row = vec![ + DBResponsePrimitive::String("2024-06-01T00:00:00.000".to_string()), + DBResponsePrimitive::String("Missouri City".to_string()), + ]; + let res = get_vanilla_row(&plan, &db_row)?; + + let month_transformed = transform_value( + DBResponsePrimitive::String("2024-06-01T00:00:00.000".to_string()), + "time", + ); + assert_eq!( + res.get(&InternedKey::new("Cube.t.month")), + Some(&month_transformed) + ); + assert_eq!( + res.get(&InternedKey::new("Cube.t.day")), + None, + "missing column stays absent" + ); + assert_eq!( + res.get(&InternedKey::new("Cube.city")), + Some(&DBResponsePrimitive::String("Missouri City".to_string())) + ); + assert_eq!( + res.get(&InternedKey::new("Cube.t")), + Some(&month_transformed), + "bare base key must fall back to the coarser present candidate" + ); + Ok(()) + } + + /// When all candidates are present, the bare key picks the finest level. + #[test] + fn test_get_vanilla_row_minimal_granularity_picks_finest_when_all_present() -> Result<()> { + let mut columns_pos: IndexMap = IndexMap::new(); + columns_pos.insert("t_day".to_string(), 0); + columns_pos.insert("t_month".to_string(), 1); + + let mut alias_to_member_name_map: HashMap = HashMap::new(); + alias_to_member_name_map.insert("t_day".to_string(), "Cube.t.day".to_string()); + alias_to_member_name_map.insert("t_month".to_string(), "Cube.t.month".to_string()); + + let mut annotation: HashMap = HashMap::new(); + annotation.insert("Cube.t.day".to_string(), make_config_item("time")); + annotation.insert("Cube.t.month".to_string(), make_config_item("time")); + + let query = make_query_with_dims(None); + let plan = build_vanilla_plan( + &columns_pos, + &alias_to_member_name_map, + &annotation, + &query, + &QueryType::RegularQuery, + )?; + + let db_row = vec![ + DBResponsePrimitive::String("2024-06-15T00:00:00.000".to_string()), + DBResponsePrimitive::String("2024-06-01T00:00:00.000".to_string()), + ]; + let res = get_vanilla_row(&plan, &db_row)?; + + let day_transformed = 
transform_value( + DBResponsePrimitive::String("2024-06-15T00:00:00.000".to_string()), + "time", + ); + assert_eq!( + res.get(&InternedKey::new("Cube.t")), + Some(&day_transformed), + "bare base key must use the finest (day) candidate, not month" + ); + Ok(()) + } } diff --git a/rust/cubesql/cubesql/src/compile/engine/df/wrapper.rs b/rust/cubesql/cubesql/src/compile/engine/df/wrapper.rs index 9085596a9031f..c30614fb78553 100644 --- a/rust/cubesql/cubesql/src/compile/engine/df/wrapper.rs +++ b/rust/cubesql/cubesql/src/compile/engine/df/wrapper.rs @@ -28,7 +28,7 @@ use datafusion::{ error::{DataFusionError, Result}, logical_plan::{ plan::Extension, replace_col, Column, DFSchema, DFSchemaRef, Expr, GroupingSet, JoinType, - LogicalPlan, UserDefinedLogicalNode, + LogicalPlan, Operator, UserDefinedLogicalNode, }, physical_plan::{aggregates::AggregateFunction, functions::BuiltinScalarFunction}, scalar::ScalarValue, @@ -1859,15 +1859,42 @@ impl WrappedSelectNode { subqueries, ) .await?; - let resulting_sql = sql_generator - .get_sql_templates() - .binary_expr(left, op.to_string(), right) - .map_err(|e| { - DataFusionError::Internal(format!( - "Can't generate SQL for binary expr: {}", - e - )) - })?; + let resulting_sql = match op { + Operator::Like => sql_generator.get_sql_templates().like_expr( + LikeType::Like, + left, + false, + right, + None, + ), + Operator::NotLike => sql_generator.get_sql_templates().like_expr( + LikeType::Like, + left, + true, + right, + None, + ), + Operator::ILike => sql_generator.get_sql_templates().like_expr( + LikeType::ILike, + left, + false, + right, + None, + ), + Operator::NotILike => sql_generator.get_sql_templates().like_expr( + LikeType::ILike, + left, + true, + right, + None, + ), + _ => sql_generator + .get_sql_templates() + .binary_expr(left, op.to_string(), right), + } + .map_err(|e| { + DataFusionError::Internal(format!("Can't generate SQL for binary expr: {}", e)) + })?; Ok((resulting_sql, sql_query)) } // Expr::AnyExpr { .. 
} => {} diff --git a/rust/cubesql/cubesql/src/compile/mod.rs b/rust/cubesql/cubesql/src/compile/mod.rs index 192080a2c8d11..2e7f84885c7f1 100644 --- a/rust/cubesql/cubesql/src/compile/mod.rs +++ b/rust/cubesql/cubesql/src/compile/mod.rs @@ -3389,6 +3389,43 @@ limit }]), None, ), + // LIKE with `\` as escape character: `\_` and `\%` must be + // resolved to literal characters, not preserved in the filter + // value, and the resulting filter must use `equals` since the + // unescaped pattern contains no wildcards. + ( + r"customer_gender LIKE 'fem\_ale'".to_string(), + Some(vec![V1LoadRequestQueryFilterItem { + member: Some("KibanaSampleDataEcommerce.customer_gender".to_string()), + operator: Some("equals".to_string()), + values: Some(vec!["fem_ale".to_string()]), + or: None, + and: None, + }]), + None, + ), + ( + r"customer_gender LIKE 'fem\%ale%'".to_string(), + Some(vec![V1LoadRequestQueryFilterItem { + member: Some("KibanaSampleDataEcommerce.customer_gender".to_string()), + operator: Some("startsWith".to_string()), + values: Some(vec!["fem%ale".to_string()]), + or: None, + and: None, + }]), + None, + ), + ( + r"customer_gender NOT LIKE '%fem\_ale'".to_string(), + Some(vec![V1LoadRequestQueryFilterItem { + member: Some("KibanaSampleDataEcommerce.customer_gender".to_string()), + operator: Some("notEndsWith".to_string()), + values: Some(vec!["fem_ale".to_string()]), + or: None, + and: None, + }]), + None, + ), // Segment ( "is_male = true".to_string(), @@ -11397,6 +11434,75 @@ ORDER BY "source"."str0" ASC ) } + #[tokio::test] + async fn test_cube_scan_like_escaped_chars_filter() { + init_testing_logger(); + + let logical_plan = convert_select_to_query_plan( + r#" + SELECT customer_gender + FROM "public"."KibanaSampleDataEcommerce" + WHERE customer_gender LIKE '%fem\_ale%' + GROUP BY 1 + "# + .to_string(), + DatabaseProtocol::PostgreSQL, + ) + .await + .as_logical_plan(); + + assert_eq!( + logical_plan.find_cube_scan().request, + V1LoadRequestQuery { + measures: 
Some(vec![]), + dimensions: Some(vec!["KibanaSampleDataEcommerce.customer_gender".to_string()]), + segments: Some(vec![]), + order: Some(vec![]), + filters: Some(vec![V1LoadRequestQueryFilterItem { + member: Some("KibanaSampleDataEcommerce.customer_gender".to_string()), + operator: Some("contains".to_string()), + values: Some(vec!["fem_ale".to_string()]), + or: None, + and: None, + }]), + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_sql_push_down_like_with_escape() { + if !Rewriter::sql_push_down_enabled() { + return; + } + init_testing_logger(); + + let query_plan = convert_select_to_query_plan_customized( + r#" + SELECT customer_gender + FROM "public"."KibanaSampleDataEcommerce" + WHERE + LOWER(customer_gender) <> 'a' + AND customer_gender LIKE 'foo\%bar' + GROUP BY 1 + "# + .to_string(), + DatabaseProtocol::PostgreSQL, + vec![( + "expressions/like".to_string(), + "{{ expr }} {% if negated %}NOT {% endif %}LIKE {{ pattern }}\ + {% if default_escape %} ESCAPE '\\\\'{% endif %}" + .to_string(), + )], + ) + .await; + + let logical_plan = query_plan.as_logical_plan(); + let sql = logical_plan.find_cube_scan_wrapped_sql().wrapped_sql.sql; + assert!(sql.contains(" LIKE ")); + assert!(sql.contains(" ESCAPE ")); + } + #[tokio::test] async fn test_thoughtspot_exclude_single_filter() { if !Rewriter::sql_push_down_enabled() { diff --git a/rust/cubesql/cubesql/src/compile/rewrite/rules/filters.rs b/rust/cubesql/cubesql/src/compile/rewrite/rules/filters.rs index d201a6bfb3909..3e182c58c9d20 100644 --- a/rust/cubesql/cubesql/src/compile/rewrite/rules/filters.rs +++ b/rust/cubesql/cubesql/src/compile/rewrite/rules/filters.rs @@ -64,6 +64,87 @@ pub struct FilterRules { eval_stable_functions: bool, } +/// Shape of a SQL LIKE pattern once escape sequences have been resolved. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum LikePatternShape { + /// No wildcards: matches the literal exactly. + Equals, + /// Trailing `%` wildcard only. 
+ StartsWith, + /// Leading `%` wildcard only. + EndsWith, + /// Both leading and trailing `%` wildcards. + Contains, +} + +/// Parse a SQL LIKE pattern using `escape_char` as the escape character. +/// +/// Returns the pattern shape and its unescaped literal portion (with `\%`, +/// `\_`, `\\` resolved to `%`, `_`, `\`), or `None` if the pattern contains +/// an internal wildcard — i.e. an unescaped `_`, or an unescaped `%` that is +/// not the very first or very last token — since such patterns cannot be +/// represented by Cube's equals/contains/startsWith/endsWith filter operators. +fn parse_like_pattern(pattern: &str, escape_char: char) -> Option<(LikePatternShape, String)> { + enum Token { + Literal(char), + Percent, + Underscore, + } + + let mut tokens: Vec = Vec::with_capacity(pattern.len()); + let mut chars = pattern.chars().peekable(); + while let Some(c) = chars.next() { + if c == escape_char { + match chars.peek().copied() { + Some(next) if next == '%' || next == '_' || next == escape_char => { + chars.next(); + tokens.push(Token::Literal(next)); + } + _ => { + // Escape char with no following %/_/escape — keep the + // escape char itself as a literal so input semantics are + // preserved rather than silently dropped. 
+                    tokens.push(Token::Literal(c));
+                }
+            }
+            continue;
+        }
+        match c {
+            '%' => tokens.push(Token::Percent),
+            '_' => tokens.push(Token::Underscore),
+            _ => tokens.push(Token::Literal(c)),
+        }
+    }
+
+    let starts_with_percent = matches!(tokens.first(), Some(Token::Percent));
+    let ends_with_percent = matches!(tokens.last(), Some(Token::Percent));
+
+    let start = if starts_with_percent { 1 } else { 0 };
+    let end = if ends_with_percent {
+        tokens.len() - 1
+    } else {
+        tokens.len()
+    };
+
+    let mut literal = String::new();
+    if end > start {
+        for tok in &tokens[start..end] {
+            match tok {
+                Token::Literal(c) => literal.push(*c),
+                Token::Percent | Token::Underscore => return None,
+            }
+        }
+    }
+
+    let shape = match (starts_with_percent, ends_with_percent) {
+        (false, false) => LikePatternShape::Equals,
+        (false, true) => LikePatternShape::StartsWith,
+        (true, false) => LikePatternShape::EndsWith,
+        (true, true) => LikePatternShape::Contains,
+    };
+    Some((shape, literal))
+}
+
 impl FilterRules {
     fn inlist_expr_list(&self, exprs: Vec<String>) -> String {
         inlist_expr_list(exprs, self.config_obj.push_down_pull_up_split())
     }
@@ -3660,73 +3741,60 @@ impl FilterRules {
                                 },
                             };
-                            let op = match literal {
-                                ScalarValue::Utf8(Some(value)) => match op {
-                                    "contains" => {
-                                        let starts_with_pcnt = value.starts_with("%");
-                                        let ends_with_pcnt = value.ends_with("%");
-                                        match (starts_with_pcnt, ends_with_pcnt) {
-                                            (false, false) => "equals",
-                                            (false, true) => "startsWith",
-                                            (true, false) => "endsWith",
-                                            (true, true) => "contains",
-                                        }
-                                    }
-                                    "notContains" => {
-                                        let starts_with_pcnt = value.starts_with("%");
-                                        let ends_with_pcnt = value.ends_with("%");
-                                        match (starts_with_pcnt, ends_with_pcnt) {
-                                            (false, false) => "notEquals",
-                                            (false, true) => "notStartsWith",
-                                            (true, false) => "notEndsWith",
-                                            (true, true) => "notContains",
-                                        }
-                                    }
-                                    _ => op,
-                                },
-                                _ => op,
+                            // LIKE-family operators are handled
+                            // separately so that `\%`, `\_`, `\\`
+                            // escape sequences in the pattern are
+                            // 
resolved before the literal is placed
+                            // into the CubeScan filter value, and so
+                            // that patterns with internal wildcards
+                            // are not silently misrepresented as
+                            // equals/contains/startsWith/endsWith.
+                            let is_like_op = matches!(
+                                expr_op,
+                                Operator::Like
+                                    | Operator::ILike
+                                    | Operator::NotLike
+                                    | Operator::NotILike,
+                            );
+
+                            let (op, like_value): (&str, Option<String>) = if is_like_op {
+                                let value = match literal {
+                                    ScalarValue::Utf8(Some(value)) => value,
+                                    _ => continue,
+                                };
+                                let Some((shape, unescaped)) =
+                                    parse_like_pattern(value, '\\')
+                                else {
+                                    continue;
+                                };
+                                let negated = matches!(
+                                    expr_op,
+                                    Operator::NotLike | Operator::NotILike,
+                                );
+                                let op = match (shape, negated) {
+                                    (LikePatternShape::Equals, false) => "equals",
+                                    (LikePatternShape::StartsWith, false) => "startsWith",
+                                    (LikePatternShape::EndsWith, false) => "endsWith",
+                                    (LikePatternShape::Contains, false) => "contains",
+                                    (LikePatternShape::Equals, true) => "notEquals",
+                                    (LikePatternShape::StartsWith, true) => "notStartsWith",
+                                    (LikePatternShape::EndsWith, true) => "notEndsWith",
+                                    (LikePatternShape::Contains, true) => "notContains",
+                                };
+                                (op, Some(unescaped))
+                            } else {
+                                (op, None)
                             };
 
                             let values = match literal {
                                 ScalarValue::Utf8(Some(value)) => vec![{
-                                    if op == "startsWith"
+                                    if let Some(like_value) = like_value {
+                                        like_value
+                                    } else if op == "startsWith"
                                         && value.starts_with("^^")
                                         && value.ends_with(".*$")
                                     {
                                         value[2..value.len() - 3].to_string()
-                                    } else if op == "contains" || op == "notContains" {
-                                        if value.starts_with("%") && value.ends_with("%") {
-                                            let without_wildcard =
-                                                value[1..value.len() - 1].to_string();
-                                            if without_wildcard.contains("%") {
-                                                continue;
-                                            }
-                                            without_wildcard
-                                        } else {
-                                            value.to_string()
-                                        }
-                                    } else if op == "startsWith" || op == "notStartsWith" {
-                                        if value.ends_with("%") {
-                                            let without_wildcard =
-                                                value[..value.len() - 1].to_string();
-                                            if without_wildcard.contains("%") {
-                                                continue;
-                                            }
-                                            without_wildcard
-                                        } else {
-                                            value.to_string()
- } - } else if op == "endsWith" || op == "notEndsWith" { - if let Some(without_wildcard) = - value.strip_prefix("%") - { - if without_wildcard.contains("%") { - continue; - } - without_wildcard.to_string() - } else { - value.to_string() - } } else { value.to_string() } diff --git a/rust/cubesql/cubesql/src/transport/service.rs b/rust/cubesql/cubesql/src/transport/service.rs index 0b720c0ff11f6..fb95c830b1fbd 100644 --- a/rust/cubesql/cubesql/src/transport/service.rs +++ b/rust/cubesql/cubesql/src/transport/service.rs @@ -912,7 +912,12 @@ impl SqlTemplates { let rendered_like = self.render_template( &format!("expressions/{}", expression_name), - context! { expr => expr, negated => negated, pattern => pattern }, + context! { + expr => expr, + negated => negated, + pattern => pattern, + default_escape => escape_char.is_none(), + }, )?; let Some(escape_char) = escape_char else {