diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
index 07ef358d0396..25cb2d001c46 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@@ -889,7 +889,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
 
     InputFormatPtr input_format;
     if (context_->getSettingsRef()[Setting::use_parquet_metadata_cache] && use_native_reader_v3
-        && (object_info->getFileFormat().value_or(configuration->getFormat()) == "Parquet")
+        && (Poco::toLower(object_info->getFileFormat().value_or(configuration->getFormat())) == "parquet")
        && !object_info->getObjectMetadata()->etag.empty())
     {
        std::optional object_with_metadata = object_info->relative_path_with_metadata;
diff --git a/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py b/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py
index ae9044e86e52..58ac641f0074 100644
--- a/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py
+++ b/tests/integration/test_storage_iceberg_with_spark/test_read_constant_columns_optimization.py
@@ -169,6 +169,18 @@ def execute_spark_query(query: str):
     for replica in started_cluster_iceberg_with_spark.instances.values():
         replica.query("SYSTEM FLUSH LOGS")
 
+    # Number of object-get requests per data file that are NOT served from caches
+    # after the warmup query above. The parquet metadata cache (enabled by default)
+    # caches the parquet footer keyed by the object's etag; the warmup query then
+    # populates it, so any subsequent read of the same file skips one object-get
+    # (the footer read). However, AzureObjectStorage::getObjectMetadata does NOT
+    # populate etag, so the cache guard `!etag.empty()` in
+    # StorageObjectStorageSource::createReader always fails for Azure, and the
+    # cache path is never taken there. As a result the multiplier is:
+    #   S3: 2 (footer served from cache, data-only gets remain)
+    #   Azure: 3 (cache never engaged, footer + data gets)
+    per_file_gets = 2 if storage_type == "s3" else 3
+
     def check_events(query_id, event, is_cluster, expected):
         res = instance.query(
             f"""
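
For context on how the new multiplier is meant to be consumed, here is a minimal, self-contained sketch of the arithmetic, assuming the test later multiplies per_file_gets by the number of Parquet data files and passes the result as the expected count to check_events. The helper name expected_gets and the file count of 5 are illustrative assumptions and not part of the patch; only per_file_gets, storage_type, and the S3 = 2 / Azure = 3 multipliers come from the diff above.

    # Hypothetical helper, not part of the patch: mirrors the per-file
    # multiplier from the test and scales it by the number of data files.
    def expected_gets(storage_type: str, num_data_files: int) -> int:
        # S3 reads skip the footer get thanks to the parquet metadata cache;
        # Azure never engages the cache (no etag), so the footer get remains.
        per_file_gets = 2 if storage_type == "s3" else 3
        return num_data_files * per_file_gets

    # Assumed file count of 5, purely for illustration.
    assert expected_gets("s3", 5) == 10
    assert expected_gets("azure", 5) == 15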