From 1971ef954cd05ecfcd12f37d2ecd6224ea6e9781 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 30 Sep 2025 16:50:35 +0800 Subject: [PATCH 01/32] update --- Cargo.lock | 51 +- Cargo.toml | 2 + crates/sail-data-source/Cargo.toml | 1 + .../sail-data-source/src/formats/iceberg.rs | 45 + crates/sail-data-source/src/formats/mod.rs | 1 + crates/sail-data-source/src/registry.rs | 2 + crates/sail-iceberg/Cargo.toml | 43 + crates/sail-iceberg/src/arrow_conversion.rs | 195 +++ crates/sail-iceberg/src/datasource/mod.rs | 3 + .../sail-iceberg/src/datasource/provider.rs | 1042 +++++++++++++++++ crates/sail-iceberg/src/lib.rs | 9 + crates/sail-iceberg/src/spec/datatypes.rs | 823 +++++++++++++ crates/sail-iceberg/src/spec/manifest.rs | 447 +++++++ crates/sail-iceberg/src/spec/manifest_list.rs | 308 +++++ crates/sail-iceberg/src/spec/mod.rs | 19 + crates/sail-iceberg/src/spec/partition.rs | 211 ++++ crates/sail-iceberg/src/spec/schema.rs | 377 ++++++ crates/sail-iceberg/src/spec/snapshot.rs | 249 ++++ .../sail-iceberg/src/spec/table_metadata.rs | 161 +++ crates/sail-iceberg/src/spec/transform.rs | 212 ++++ crates/sail-iceberg/src/spec/values.rs | 57 + crates/sail-iceberg/src/table_format.rs | 206 ++++ pyproject.toml | 4 + python/pysail/tests/spark/iceberg/__init__.py | 0 .../tests/spark/iceberg/test_iceberg_io.py | 172 +++ 25 files changed, 4638 insertions(+), 2 deletions(-) create mode 100644 crates/sail-data-source/src/formats/iceberg.rs create mode 100644 crates/sail-iceberg/Cargo.toml create mode 100644 crates/sail-iceberg/src/arrow_conversion.rs create mode 100644 crates/sail-iceberg/src/datasource/mod.rs create mode 100644 crates/sail-iceberg/src/datasource/provider.rs create mode 100644 crates/sail-iceberg/src/lib.rs create mode 100644 crates/sail-iceberg/src/spec/datatypes.rs create mode 100644 crates/sail-iceberg/src/spec/manifest.rs create mode 100644 crates/sail-iceberg/src/spec/manifest_list.rs create mode 100644 crates/sail-iceberg/src/spec/mod.rs create mode 100644 crates/sail-iceberg/src/spec/partition.rs create mode 100644 crates/sail-iceberg/src/spec/schema.rs create mode 100644 crates/sail-iceberg/src/spec/snapshot.rs create mode 100644 crates/sail-iceberg/src/spec/table_metadata.rs create mode 100644 crates/sail-iceberg/src/spec/transform.rs create mode 100644 crates/sail-iceberg/src/spec/values.rs create mode 100644 crates/sail-iceberg/src/table_format.rs create mode 100644 python/pysail/tests/spark/iceberg/__init__.py create mode 100644 python/pysail/tests/spark/iceberg/test_iceberg_io.py diff --git a/Cargo.lock b/Cargo.lock index a833356162..065f958ea0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4480,6 +4480,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", + "rand 0.8.5", + "serde", +] + [[package]] name = "outref" version = "0.5.2" @@ -5145,6 +5156,7 @@ dependencies = [ "libc", "rand_chacha 0.3.1", "rand_core 0.6.4", + "serde", ] [[package]] @@ -5184,6 +5196,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom 0.2.16", + "serde", ] [[package]] @@ -5618,6 +5631,7 @@ dependencies = [ "sail-common", "sail-common-datafusion", "sail-delta-lake", + "sail-iceberg", "serde", "serde_yaml", "syn", @@ -5736,6 +5750,39 @@ dependencies = [ 
"serde_json", ] +[[package]] +name = "sail-iceberg" +version = "0.3.5" +dependencies = [ + "apache-avro", + "arrow-schema", + "async-trait", + "base64 0.22.1", + "bytes", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "futures", + "indexmap", + "itertools 0.14.0", + "log", + "num-bigint", + "num-traits", + "object_store", + "once_cell", + "ordered-float 4.6.0", + "parquet", + "percent-encoding", + "sail-common-datafusion", + "serde", + "serde_json", + "tokio", + "url", + "uuid", +] + [[package]] name = "sail-logical-optimizer" version = "0.3.5" @@ -6108,7 +6155,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" dependencies = [ - "ordered-float", + "ordered-float 2.10.1", "serde", ] @@ -6541,7 +6588,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float", + "ordered-float 2.10.1", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index dcb37fef37..ece580545c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -100,6 +100,8 @@ moka = { version = "0.12.10", features = ["sync"] } bytes = "1.10.1" indexmap = "2.11.0" pin-project-lite = "0.2.16" +ordered-float = { version = "4.5.0", features = ["serde"] } +apache-avro = { version = "0.20.0" } ###### # The versions of the following dependencies are managed manually. diff --git a/crates/sail-data-source/Cargo.toml b/crates/sail-data-source/Cargo.toml index 95d35ddb38..64e7f57280 100644 --- a/crates/sail-data-source/Cargo.toml +++ b/crates/sail-data-source/Cargo.toml @@ -10,6 +10,7 @@ workspace = true sail-common = { path = "../sail-common" } sail-common-datafusion = { path = "../sail-common-datafusion" } sail-delta-lake = { path = "../sail-delta-lake" } +sail-iceberg = { path = "../sail-iceberg" } async-trait = { workspace = true } serde = { workspace = true } diff --git a/crates/sail-data-source/src/formats/iceberg.rs b/crates/sail-data-source/src/formats/iceberg.rs new file mode 100644 index 0000000000..4131e2cbab --- /dev/null +++ b/crates/sail-data-source/src/formats/iceberg.rs @@ -0,0 +1,45 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::{Session, TableProvider}; +use datafusion::common::Result; +use datafusion::physical_plan::ExecutionPlan; +use sail_common_datafusion::datasource::{SinkInfo, SourceInfo, TableFormat}; +use sail_iceberg::IcebergTableFormat; + +/// Iceberg table format implementation that delegates to sail-iceberg +#[derive(Debug)] +pub struct IcebergDataSourceFormat { + inner: IcebergTableFormat, +} + +impl Default for IcebergDataSourceFormat { + fn default() -> Self { + Self { + inner: IcebergTableFormat, + } + } +} + +#[async_trait] +impl TableFormat for IcebergDataSourceFormat { + fn name(&self) -> &str { + self.inner.name() + } + + async fn create_provider( + &self, + ctx: &dyn Session, + info: SourceInfo, + ) -> Result> { + self.inner.create_provider(ctx, info).await + } + + async fn create_writer( + &self, + ctx: &dyn Session, + info: SinkInfo, + ) -> Result> { + self.inner.create_writer(ctx, info).await + } +} diff --git a/crates/sail-data-source/src/formats/mod.rs b/crates/sail-data-source/src/formats/mod.rs index 1b02dec6e1..2998e8a1bd 100644 --- a/crates/sail-data-source/src/formats/mod.rs +++ b/crates/sail-data-source/src/formats/mod.rs @@ -4,6 +4,7 @@ pub mod binary; pub mod console; pub mod csv; pub mod 
delta; +pub mod iceberg; pub mod json; pub mod listing; pub mod parquet; diff --git a/crates/sail-data-source/src/registry.rs b/crates/sail-data-source/src/registry.rs index c45cd66651..ced13c386b 100644 --- a/crates/sail-data-source/src/registry.rs +++ b/crates/sail-data-source/src/registry.rs @@ -11,6 +11,7 @@ use crate::formats::binary::BinaryTableFormat; use crate::formats::console::ConsoleTableFormat; use crate::formats::csv::CsvTableFormat; use crate::formats::delta::DeltaTableFormat; +use crate::formats::iceberg::IcebergDataSourceFormat; use crate::formats::json::JsonTableFormat; use crate::formats::parquet::ParquetTableFormat; use crate::formats::rate::RateTableFormat; @@ -42,6 +43,7 @@ impl TableFormatRegistry { registry.register_format(Arc::new(BinaryTableFormat::default())); registry.register_format(Arc::new(CsvTableFormat::default())); registry.register_format(Arc::new(DeltaTableFormat)); + registry.register_format(Arc::new(IcebergDataSourceFormat::default())); registry.register_format(Arc::new(JsonTableFormat::default())); registry.register_format(Arc::new(ParquetTableFormat::default())); registry.register_format(Arc::new(TextTableFormat::default())); diff --git a/crates/sail-iceberg/Cargo.toml b/crates/sail-iceberg/Cargo.toml new file mode 100644 index 0000000000..e2820b0f7d --- /dev/null +++ b/crates/sail-iceberg/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "sail-iceberg" +version = { workspace = true } +edition = { workspace = true } + +[dependencies] +sail-common-datafusion = { path = "../sail-common-datafusion" } +# Delta Lake + +# DataFusion dependencies +datafusion = { workspace = true } +datafusion-common = { workspace = true } +datafusion-physical-expr-adapter = { workspace = true } +datafusion-physical-expr = { workspace = true } + +# Arrow dependencies +arrow-schema = { workspace = true } + +# Essential utilities +async-trait = { workspace = true } +object_store = { workspace = true } +chrono = { workspace = true } +serde_json = { workspace = true } +url = { workspace = true } +futures = { workspace = true } +serde = { workspace = true, features = ["derive"] } +tokio = { workspace = true } +uuid = { workspace = true } +parquet = { workspace = true } +bytes = { workspace = true } +indexmap = { workspace = true } +log = { workspace = true } +itertools = { workspace = true } +percent-encoding = { workspace = true } +once_cell = { workspace = true } +ordered-float = { workspace = true } +apache-avro = { workspace = true } +num-bigint = { workspace = true } +num-traits = { workspace = true } +base64 = { workspace = true } + +[lints] +workspace = true diff --git a/crates/sail-iceberg/src/arrow_conversion.rs b/crates/sail-iceberg/src/arrow_conversion.rs new file mode 100644 index 0000000000..7936f85311 --- /dev/null +++ b/crates/sail-iceberg/src/arrow_conversion.rs @@ -0,0 +1,195 @@ +use std::sync::Arc; + +use arrow_schema::{ + DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, TimeUnit, +}; +use datafusion_common::Result; + +use crate::spec::{NestedField, PrimitiveType, Schema, StructType, Type}; + +/// Convert Iceberg schema to Arrow schema +pub fn iceberg_schema_to_arrow(schema: &Schema) -> Result { + let fields = schema + .fields() + .iter() + .map(|field| iceberg_field_to_arrow(field)) + .collect::>>()?; + + Ok(ArrowSchema::new(fields)) +} + +/// Convert Iceberg field to Arrow field +pub fn iceberg_field_to_arrow(field: &NestedField) -> Result { + let arrow_type = iceberg_type_to_arrow(&field.field_type)?; + let nullable = !field.required; + + 
Ok(ArrowField::new(&field.name, arrow_type, nullable)) +} + +/// Convert Iceberg type to Arrow data type +pub fn iceberg_type_to_arrow(iceberg_type: &Type) -> Result { + match iceberg_type { + Type::Primitive(primitive) => iceberg_primitive_to_arrow(primitive), + Type::Struct(struct_type) => iceberg_struct_to_arrow(struct_type), + Type::List(list_type) => { + let element_field = iceberg_field_to_arrow(&list_type.element_field)?; + Ok(ArrowDataType::List(Arc::new(element_field))) + } + Type::Map(map_type) => { + let key_field = iceberg_field_to_arrow(&map_type.key_field)?; + let value_field = iceberg_field_to_arrow(&map_type.value_field)?; + + // Arrow Map type expects a struct with key and value fields + let entries_field = ArrowField::new( + "entries", + ArrowDataType::Struct(vec![key_field, value_field].into()), + false, // entries field itself is not nullable + ); + + Ok(ArrowDataType::Map(Arc::new(entries_field), false)) + } + } +} + +/// Convert Iceberg primitive type to Arrow data type +pub fn iceberg_primitive_to_arrow(primitive: &PrimitiveType) -> Result { + let arrow_type = match primitive { + PrimitiveType::Boolean => ArrowDataType::Boolean, + PrimitiveType::Int => ArrowDataType::Int32, + PrimitiveType::Long => ArrowDataType::Int64, + PrimitiveType::Float => ArrowDataType::Float32, + PrimitiveType::Double => ArrowDataType::Float64, + PrimitiveType::Decimal { precision, scale } => { + ArrowDataType::Decimal128(*precision as u8, *scale as i8) + } + PrimitiveType::Date => ArrowDataType::Date32, + PrimitiveType::Time => ArrowDataType::Time64(TimeUnit::Microsecond), + PrimitiveType::Timestamp => ArrowDataType::Timestamp(TimeUnit::Microsecond, None), + PrimitiveType::Timestamptz => { + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + } + PrimitiveType::TimestampNs => ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + PrimitiveType::TimestamptzNs => { + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())) + } + PrimitiveType::String => ArrowDataType::Utf8, + PrimitiveType::Uuid => ArrowDataType::FixedSizeBinary(16), + PrimitiveType::Fixed(size) => ArrowDataType::FixedSizeBinary(*size as i32), + PrimitiveType::Binary => ArrowDataType::Binary, + }; + + Ok(arrow_type) +} + +/// Convert Iceberg struct type to Arrow struct data type +pub fn iceberg_struct_to_arrow(struct_type: &StructType) -> Result { + let fields = struct_type + .fields() + .iter() + .map(|field| iceberg_field_to_arrow(field)) + .collect::>>()?; + + Ok(ArrowDataType::Struct(fields.into())) +} + +#[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used)] +mod tests { + use std::sync::Arc; + + use super::*; + use crate::spec::{NestedField, PrimitiveType, Schema, Type}; + + #[test] + fn test_primitive_type_conversion() { + let test_cases = vec![ + (PrimitiveType::Boolean, ArrowDataType::Boolean), + (PrimitiveType::Int, ArrowDataType::Int32), + (PrimitiveType::Long, ArrowDataType::Int64), + (PrimitiveType::Float, ArrowDataType::Float32), + (PrimitiveType::Double, ArrowDataType::Float64), + (PrimitiveType::String, ArrowDataType::Utf8), + (PrimitiveType::Binary, ArrowDataType::Binary), + (PrimitiveType::Date, ArrowDataType::Date32), + ( + PrimitiveType::Time, + ArrowDataType::Time64(TimeUnit::Microsecond), + ), + ( + PrimitiveType::Timestamp, + ArrowDataType::Timestamp(TimeUnit::Microsecond, None), + ), + ( + PrimitiveType::Timestamptz, + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), + ), + (PrimitiveType::Uuid, ArrowDataType::FixedSizeBinary(16)), + 
(PrimitiveType::Fixed(10), ArrowDataType::FixedSizeBinary(10)), + ]; + + for (iceberg_type, expected_arrow_type) in test_cases { + let result = iceberg_primitive_to_arrow(&iceberg_type) + .expect("Failed to convert iceberg type to arrow"); + assert_eq!(result, expected_arrow_type); + } + } + + #[test] + fn test_decimal_type_conversion() { + let decimal_type = PrimitiveType::Decimal { + precision: 10, + scale: 2, + }; + let result = iceberg_primitive_to_arrow(&decimal_type) + .expect("Failed to convert decimal type to arrow"); + assert_eq!(result, ArrowDataType::Decimal128(10, 2)); + } + + #[test] + fn test_schema_conversion() { + let schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + Arc::new(NestedField::required( + 1, + "id", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 2, + "name", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::required( + 3, + "price", + Type::Primitive(PrimitiveType::Decimal { + precision: 10, + scale: 2, + }), + )), + ]) + .build() + .expect("Failed to build schema"); + + let arrow_schema = + iceberg_schema_to_arrow(&schema).expect("Failed to convert schema to arrow"); + + assert_eq!(arrow_schema.fields().len(), 3); + + let id_field = arrow_schema.field(0); + assert_eq!(id_field.name(), "id"); + assert_eq!(id_field.data_type(), &ArrowDataType::Int64); + assert!(!id_field.is_nullable()); + + let name_field = arrow_schema.field(1); + assert_eq!(name_field.name(), "name"); + assert_eq!(name_field.data_type(), &ArrowDataType::Utf8); + assert!(name_field.is_nullable()); + + let price_field = arrow_schema.field(2); + assert_eq!(price_field.name(), "price"); + assert_eq!(price_field.data_type(), &ArrowDataType::Decimal128(10, 2)); + assert!(!price_field.is_nullable()); + } +} diff --git a/crates/sail-iceberg/src/datasource/mod.rs b/crates/sail-iceberg/src/datasource/mod.rs new file mode 100644 index 0000000000..588b4a12ef --- /dev/null +++ b/crates/sail-iceberg/src/datasource/mod.rs @@ -0,0 +1,3 @@ +pub mod provider; + +pub use provider::*; diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs new file mode 100644 index 0000000000..540c368b3c --- /dev/null +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -0,0 +1,1042 @@ +use std::any::Any; +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; + +use apache_avro::{from_value, Reader as AvroReader}; +use arrow_schema::Schema as ArrowSchema; +use async_trait::async_trait; +use datafusion::catalog::memory::DataSourceExec; +use datafusion::catalog::Session; +use datafusion::common::scalar::ScalarValue; +use datafusion::common::stats::{ColumnStatistics, Precision, Statistics}; +use datafusion::common::Result as DataFusionResult; +use datafusion::config::TableParquetOptions; +use datafusion::datasource::listing::PartitionedFile; +use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder, ParquetSource}; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::execution::object_store::ObjectStoreUrl; +use datafusion::logical_expr::{Expr, LogicalPlan}; +use datafusion::physical_plan::ExecutionPlan; +use object_store::path::Path as ObjectPath; +use object_store::ObjectMeta; +use serde::{Deserialize, Serialize}; +use url::Url; + +use crate::arrow_conversion::iceberg_schema_to_arrow; +use crate::spec::{ + DataContentType, DataFile, FieldSummary, Literal, ManifestContentType, ManifestFile, + ManifestList, ManifestStatus, PrimitiveLiteral, 
Schema, Snapshot, +}; + +/// Iceberg table provider for DataFusion +#[derive(Debug)] +pub struct IcebergTableProvider { + /// The table location (URI) + table_uri: String, + /// The current schema of the table + schema: Schema, + /// The current snapshot of the table + snapshot: Snapshot, + /// Arrow schema for DataFusion + arrow_schema: Arc, +} + +impl IcebergTableProvider { + /// Create a new Iceberg table provider + pub fn new( + table_uri: impl ToString, + schema: Schema, + snapshot: Snapshot, + ) -> DataFusionResult { + let table_uri_str = table_uri.to_string(); + log::info!("[ICEBERG] Creating table provider for: {}", table_uri_str); + + let arrow_schema = Arc::new(iceberg_schema_to_arrow(&schema).map_err(|e| { + log::error!("[ICEBERG] Failed to convert schema to Arrow: {:?}", e); + e + })?); + + log::debug!( + "[ICEBERG] Converted schema to Arrow with {} fields", + arrow_schema.fields().len() + ); + + Ok(Self { + table_uri: table_uri_str, + schema, + snapshot, + arrow_schema, + }) + } + + /// Get the table URI + pub fn table_uri(&self) -> &str { + &self.table_uri + } + + /// Get the Iceberg schema + pub fn iceberg_schema(&self) -> &Schema { + &self.schema + } + + /// Get the current snapshot + pub fn current_snapshot(&self) -> &Snapshot { + &self.snapshot + } + + /// Get object store from DataFusion session + fn get_object_store( + &self, + session: &dyn Session, + ) -> DataFusionResult> { + let table_url = Url::parse(&self.table_uri) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + session + .runtime_env() + .object_store_registry + .get_store(&table_url) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e))) + } + + /// Load manifest list from snapshot + async fn load_manifest_list( + &self, + object_store: &Arc, + ) -> DataFusionResult { + let manifest_list_str = self.snapshot.manifest_list(); + log::debug!("[ICEBERG] Manifest list path: {}", manifest_list_str); + + let manifest_list_path = if let Ok(url) = Url::parse(manifest_list_str) { + log::debug!( + "[ICEBERG] Parsed manifest list as URL, path: {}", + url.path() + ); + ObjectPath::from(url.path()) + } else { + ObjectPath::from(manifest_list_str) + }; + + let manifest_list_data = object_store + .get(&manifest_list_path) + .await + .map_err(|e| { + log::error!("[ICEBERG] Failed to get manifest list: {:?}", e); + datafusion::common::DataFusionError::External(Box::new(e)) + })? 
+ .bytes() + .await + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + log::debug!( + "[ICEBERG] Read {} bytes from manifest list", + manifest_list_data.len() + ); + + self.parse_manifest_list(&manifest_list_data) + } + + /// Parse manifest list from Avro bytes + fn parse_manifest_list(&self, data: &[u8]) -> DataFusionResult { + log::debug!("[ICEBERG] Parsing manifest list Avro data"); + let reader = AvroReader::new(data) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + let mut manifest_files = Vec::new(); + for value in reader { + let value = + value.map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + log::trace!("[ICEBERG] Deserializing manifest file entry"); + let manifest_file: ManifestFileAvro = from_value(&value).map_err(|e| { + log::error!("[ICEBERG] Failed to deserialize manifest file: {:?}", e); + datafusion::common::DataFusionError::External(Box::new(e)) + })?; + manifest_files.push(manifest_file.into()); + } + + Ok(ManifestList::new(manifest_files)) + } + + /// Load data files from manifests + async fn load_data_files( + &self, + object_store: &Arc, + manifest_list: &ManifestList, + ) -> DataFusionResult> { + let mut data_files = Vec::new(); + + for manifest_file in manifest_list.entries() { + // TODO: Support delete manifests + if manifest_file.content != ManifestContentType::Data { + continue; + } + + let manifest_path_str = manifest_file.manifest_path.as_str(); + log::debug!("[ICEBERG] Loading manifest: {}", manifest_path_str); + + let manifest_path = if let Ok(url) = Url::parse(manifest_path_str) { + log::debug!("[ICEBERG] Parsed manifest as URL, path: {}", url.path()); + ObjectPath::from(url.path()) + } else { + ObjectPath::from(manifest_path_str) + }; + + let manifest_data = object_store + .get(&manifest_path) + .await + .map_err(|e| { + log::error!("[ICEBERG] Failed to get manifest: {:?}", e); + datafusion::common::DataFusionError::External(Box::new(e)) + })? 
+ .bytes() + .await + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + log::debug!("[ICEBERG] Read {} bytes from manifest", manifest_data.len()); + + let manifest_entries = self.parse_manifest(&manifest_data)?; + + // Get partition_spec_id from manifest file + let partition_spec_id = manifest_file.partition_spec_id; + + for entry in manifest_entries { + // Only include added and existing files, skip deleted files + let status = match entry.status { + 0 => ManifestStatus::Existing, + 1 => ManifestStatus::Added, + 2 => ManifestStatus::Deleted, + _ => ManifestStatus::Existing, + }; + + if matches!(status, ManifestStatus::Added | ManifestStatus::Existing) { + // Convert DataFileAvro to DataFile with schema and partition_spec_id + let data_file = entry + .data_file + .into_data_file(&self.schema, partition_spec_id); + data_files.push(data_file); + } + } + } + + Ok(data_files) + } + + /// Parse manifest from Avro bytes + fn parse_manifest(&self, data: &[u8]) -> DataFusionResult> { + log::debug!("[ICEBERG] Parsing manifest Avro data"); + let reader = AvroReader::new(data) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + let mut entries = Vec::new(); + for value in reader { + let value = + value.map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + log::trace!("[ICEBERG] Deserializing data file entry"); + let mut entry: ManifestEntryAvro = from_value(&value).map_err(|e| { + log::error!("[ICEBERG] Failed to deserialize data file entry: {:?}", e); + datafusion::common::DataFusionError::External(Box::new(e)) + })?; + + // Extract map fields from raw Avro value + if let apache_avro::types::Value::Record(fields) = &value { + for (field_name, field_value) in fields { + if field_name == "data_file" { + if let apache_avro::types::Value::Record(data_file_fields) = field_value { + entry + .data_file + .extract_map_fields_from_avro(data_file_fields); + } + } + } + } + + entries.push(entry); + } + + log::debug!("[ICEBERG] Parsed {} entries from manifest", entries.len()); + Ok(entries) + } + + /// Create partitioned files for DataFusion from Iceberg data files + fn create_partitioned_files( + &self, + data_files: Vec, + ) -> DataFusionResult> { + let mut partitioned_files = Vec::new(); + + let table_url = Url::parse(&self.table_uri) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + let table_base_path = table_url.path(); + + for data_file in data_files { + let file_path_str = data_file.file_path(); + log::debug!("[ICEBERG] Processing data file: {}", file_path_str); + + let file_path = if let Ok(url) = Url::parse(file_path_str) { + ObjectPath::from(url.path()) + } else { + ObjectPath::from(format!( + "{}{}{}", + table_base_path, + object_store::path::DELIMITER, + file_path_str + )) + }; + + log::debug!("[ICEBERG] Final ObjectPath: {}", file_path); + + let object_meta = ObjectMeta { + location: file_path, + last_modified: chrono::Utc::now(), + size: data_file.file_size_in_bytes(), + e_tag: None, + version: None, + }; + + // Convert partition values to ScalarValues + let partition_values = data_file + .partition() + .iter() + .map(|literal_opt| match literal_opt { + Some(literal) => self.literal_to_scalar_value(literal), + None => ScalarValue::Null, + }) + .collect(); + + let partitioned_file = PartitionedFile { + object_meta, + partition_values, + range: None, + statistics: Some(Arc::new(self.create_file_statistics(&data_file))), + extensions: None, + metadata_size_hint: None, + }; + + 
partitioned_files.push(partitioned_file); + } + + Ok(partitioned_files) + } + + /// Create file groups from partitioned files + fn create_file_groups(&self, partitioned_files: Vec) -> Vec { + // Group files by partition values + let mut file_groups: HashMap, Vec> = HashMap::new(); + + for file in partitioned_files { + file_groups + .entry(file.partition_values.clone()) + .or_default() + .push(file); + } + + file_groups.into_values().map(FileGroup::from).collect() + } + + /// Convert Iceberg Literal to DataFusion ScalarValue + fn literal_to_scalar_value(&self, literal: &Literal) -> ScalarValue { + match literal { + Literal::Primitive(primitive) => match primitive { + PrimitiveLiteral::Boolean(v) => ScalarValue::Boolean(Some(*v)), + PrimitiveLiteral::Int(v) => ScalarValue::Int32(Some(*v)), + PrimitiveLiteral::Long(v) => ScalarValue::Int64(Some(*v)), + PrimitiveLiteral::Float(v) => ScalarValue::Float32(Some(v.into_inner())), + PrimitiveLiteral::Double(v) => ScalarValue::Float64(Some(v.into_inner())), + PrimitiveLiteral::String(v) => ScalarValue::Utf8(Some(v.clone())), + PrimitiveLiteral::Binary(v) => ScalarValue::Binary(Some(v.clone())), + PrimitiveLiteral::Int128(v) => ScalarValue::Decimal128(Some(*v), 38, 0), + PrimitiveLiteral::UInt128(v) => { + if *v <= i128::MAX as u128 { + ScalarValue::Decimal128(Some(*v as i128), 38, 0) + } else { + ScalarValue::Utf8(Some(v.to_string())) + } + } + }, + Literal::Struct(fields) => { + let json_repr = serde_json::to_string(fields).unwrap_or_default(); + ScalarValue::Utf8(Some(json_repr)) + } + Literal::List(items) => { + let json_repr = serde_json::to_string(items).unwrap_or_default(); + ScalarValue::Utf8(Some(json_repr)) + } + Literal::Map(pairs) => { + let json_repr = serde_json::to_string(pairs).unwrap_or_default(); + ScalarValue::Utf8(Some(json_repr)) + } + } + } + + /// Create file statistics from Iceberg data file metadata + fn create_file_statistics(&self, data_file: &DataFile) -> Statistics { + let num_rows = Precision::Exact(data_file.record_count() as usize); + let total_byte_size = Precision::Exact(data_file.file_size_in_bytes() as usize); + + // Create column statistics from Iceberg metadata + let column_statistics = self + .arrow_schema + .fields() + .iter() + .enumerate() + .map(|(i, _field)| { + let field_id = self + .schema + .fields() + .get(i) + .map(|f| f.id) + .unwrap_or(i as i32 + 1); + + let null_count = data_file + .null_value_counts() + .get(&field_id) + .map(|&count| Precision::Exact(count as usize)) + .unwrap_or(Precision::Absent); + + let distinct_count = Precision::Absent; + + let min_value = data_file + .lower_bounds() + .get(&field_id) + .map(|literal| self.literal_to_scalar_value(literal)) + .map(Precision::Exact) + .unwrap_or(Precision::Absent); + + let max_value = data_file + .upper_bounds() + .get(&field_id) + .map(|literal| self.literal_to_scalar_value(literal)) + .map(Precision::Exact) + .unwrap_or(Precision::Absent); + + ColumnStatistics { + null_count, + max_value, + min_value, + distinct_count, + sum_value: Precision::Absent, + } + }) + .collect(); + + Statistics { + num_rows, + total_byte_size, + column_statistics, + } + } +} + +/// Avro representation of ManifestFile for deserialization +#[derive(Debug, Serialize, Deserialize)] +struct ManifestFileAvro { + #[serde(rename = "manifest_path")] + manifest_path: String, + #[serde(rename = "manifest_length")] + manifest_length: i64, + #[serde(rename = "partition_spec_id")] + partition_spec_id: i32, + #[serde(rename = "content")] + content: i32, + #[serde(rename = 
"sequence_number")] + sequence_number: i64, + #[serde(rename = "min_sequence_number")] + min_sequence_number: i64, + #[serde(rename = "added_snapshot_id")] + added_snapshot_id: i64, + #[serde(rename = "added_files_count")] + added_files_count: i32, + #[serde(rename = "existing_files_count")] + existing_files_count: i32, + #[serde(rename = "deleted_files_count")] + deleted_files_count: i32, + #[serde(rename = "added_rows_count")] + added_rows_count: i64, + #[serde(rename = "existing_rows_count")] + existing_rows_count: i64, + #[serde(rename = "deleted_rows_count")] + deleted_rows_count: i64, + #[serde(rename = "partitions")] + partitions: Option>, + #[serde(rename = "key_metadata")] + key_metadata: Option>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct FieldSummaryAvro { + #[serde(rename = "contains_null")] + contains_null: bool, + #[serde(rename = "contains_nan")] + contains_nan: Option, + #[serde(rename = "lower_bound")] + lower_bound: Option>, + #[serde(rename = "upper_bound")] + upper_bound: Option>, +} + +impl From for ManifestFile { + fn from(avro: ManifestFileAvro) -> Self { + let content = match avro.content { + 0 => ManifestContentType::Data, + 1 => ManifestContentType::Deletes, + _ => ManifestContentType::Data, + }; + + let partitions = avro.partitions.map(|summaries| { + summaries + .into_iter() + .map(|summary| { + let lower_bound = summary + .lower_bound + .and_then(|bytes| String::from_utf8(bytes).ok()) + .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); + + let upper_bound = summary + .upper_bound + .and_then(|bytes| String::from_utf8(bytes).ok()) + .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); + + let mut field_summary = FieldSummary::new(summary.contains_null); + if let Some(contains_nan) = summary.contains_nan { + field_summary = field_summary.with_contains_nan(contains_nan); + } + if let Some(lower) = lower_bound { + field_summary = field_summary.with_lower_bound(lower); + } + if let Some(upper) = upper_bound { + field_summary = field_summary.with_upper_bound(upper); + } + field_summary + }) + .collect() + }); + + ManifestFile { + manifest_path: avro.manifest_path, + manifest_length: avro.manifest_length, + partition_spec_id: avro.partition_spec_id, + content, + sequence_number: avro.sequence_number, + min_sequence_number: avro.min_sequence_number, + added_snapshot_id: avro.added_snapshot_id, + added_files_count: avro.added_files_count, + existing_files_count: avro.existing_files_count, + deleted_files_count: avro.deleted_files_count, + added_rows_count: avro.added_rows_count, + existing_rows_count: avro.existing_rows_count, + deleted_rows_count: avro.deleted_rows_count, + partitions, + key_metadata: avro.key_metadata, + } + } +} + +/// Parse Avro map format (array of {key, value} objects) to HashMap for i64 values +fn parse_i64_map_from_avro(values: &Option) -> HashMap { + use apache_avro::types::Value; + + let mut map = HashMap::new(); + + let vec_opt = if let Some(Value::Union(_, boxed)) = values { + if let Value::Array(vec) = boxed.as_ref() { + Some(vec) + } else { + None + } + } else if let Some(Value::Array(vec)) = values { + Some(vec) + } else { + None + }; + + if let Some(vec) = vec_opt { + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + + for (field_name, field_value) in fields { + match field_name.as_str() { + "key" => { + if let Value::Int(k) = field_value { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Long(v) = field_value { + value_opt = Some(*v); 
+ } + } + _ => {} + } + } + + if let (Some(key), Some(value)) = (key_opt, value_opt) { + map.insert(key, value); + } + } + } + } + + map +} + +/// Parse Avro map format for byte arrays from Avro Values +fn parse_bytes_map_from_avro( + values: &Option, +) -> Option>> { + use apache_avro::types::Value; + + if let Some(Value::Union(_, boxed)) = values { + if let Value::Array(vec) = boxed.as_ref() { + let mut map = HashMap::new(); + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + + for (field_name, field_value) in fields { + match field_name.as_str() { + "key" => { + if let Value::Int(k) = field_value { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Bytes(b) = field_value { + value_opt = Some(b.clone()); + } + } + _ => {} + } + } + + if let (Some(key), Some(value)) = (key_opt, value_opt) { + map.insert(key, value); + } + } + } + return Some(map); + } + } else if let Some(Value::Array(vec)) = values { + let mut map = HashMap::new(); + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + + for (field_name, field_value) in fields { + match field_name.as_str() { + "key" => { + if let Value::Int(k) = field_value { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Bytes(b) = field_value { + value_opt = Some(b.clone()); + } + } + _ => {} + } + } + + if let (Some(key), Some(value)) = (key_opt, value_opt) { + map.insert(key, value); + } + } + } + return Some(map); + } + + None +} + +/// Avro representation of ManifestEntry for deserialization +#[derive(Debug, Serialize, Deserialize)] +struct ManifestEntryAvro { + #[serde(rename = "status")] + status: i32, + #[serde(rename = "snapshot_id")] + snapshot_id: Option, + #[serde(rename = "sequence_number")] + sequence_number: Option, + #[serde(rename = "file_sequence_number")] + file_sequence_number: Option, + #[serde(rename = "data_file")] + data_file: DataFileAvro, +} + +#[derive(Debug, Serialize, Deserialize)] +struct DataFileAvro { + #[serde(rename = "content", default)] + content: i32, + #[serde(rename = "file_path")] + file_path: String, + #[serde(rename = "file_format")] + file_format: String, + #[serde(rename = "partition")] + partition: serde_json::Value, + #[serde(rename = "record_count")] + record_count: i64, + #[serde(rename = "file_size_in_bytes")] + file_size_in_bytes: i64, + #[serde(skip)] + column_sizes: Option, + #[serde(skip)] + value_counts: Option, + #[serde(skip)] + null_value_counts: Option, + #[serde(skip)] + nan_value_counts: Option, + #[serde(skip)] + lower_bounds: Option, + #[serde(skip)] + upper_bounds: Option, + #[serde(rename = "key_metadata")] + key_metadata: Option>, + #[serde(rename = "split_offsets")] + split_offsets: Option>, + #[serde(rename = "equality_ids")] + equality_ids: Option>, + #[serde(rename = "sort_order_id")] + sort_order_id: Option, +} + +impl DataFileAvro { + /// Extract map fields from raw Avro record fields + fn extract_map_fields_from_avro(&mut self, fields: &[(String, apache_avro::types::Value)]) { + for (field_name, field_value) in fields { + match field_name.as_str() { + "column_sizes" => self.column_sizes = Some(field_value.clone()), + "value_counts" => self.value_counts = Some(field_value.clone()), + "null_value_counts" => self.null_value_counts = Some(field_value.clone()), + "nan_value_counts" => self.nan_value_counts = Some(field_value.clone()), + "lower_bounds" => self.lower_bounds = Some(field_value.clone()), + "upper_bounds" => self.upper_bounds = 
Some(field_value.clone()), + _ => {} + } + } + } + + /// Convert DataFileAvro to DataFile with schema context for proper bound parsing + fn into_data_file(self, schema: &Schema, partition_spec_id: i32) -> DataFile { + let content = match self.content { + 0 => DataContentType::Data, + 1 => DataContentType::PositionDeletes, + 2 => DataContentType::EqualityDeletes, + _ => DataContentType::Data, + }; + + let file_format = match self.file_format.to_uppercase().as_str() { + "PARQUET" => crate::spec::DataFileFormat::Parquet, + "AVRO" => crate::spec::DataFileFormat::Avro, + "ORC" => crate::spec::DataFileFormat::Orc, + _ => crate::spec::DataFileFormat::Parquet, // Default + }; + + // Parse partition values from JSON + let partition = parse_partition_values(Some(&self.partition)); + + // Parse Avro map arrays (array of {key, value} records) + let column_sizes = parse_i64_map_from_avro(&self.column_sizes) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + + let value_counts = parse_i64_map_from_avro(&self.value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + + let null_value_counts = parse_i64_map_from_avro(&self.null_value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + + let nan_value_counts = parse_i64_map_from_avro(&self.nan_value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + + // Parse bounds from binary data using schema for proper type conversion + let lower_bounds_raw = parse_bytes_map_from_avro(&self.lower_bounds); + let upper_bounds_raw = parse_bytes_map_from_avro(&self.upper_bounds); + let lower_bounds = parse_bounds_from_binary(lower_bounds_raw.as_ref(), schema); + let upper_bounds = parse_bounds_from_binary(upper_bounds_raw.as_ref(), schema); + + DataFile { + content, + file_path: self.file_path, + file_format, + partition, + record_count: self.record_count as u64, + file_size_in_bytes: self.file_size_in_bytes as u64, + column_sizes, + value_counts, + null_value_counts, + nan_value_counts, + lower_bounds, + upper_bounds, + key_metadata: self.key_metadata, + split_offsets: self.split_offsets.unwrap_or_default(), + equality_ids: self + .equality_ids + .unwrap_or_default() + .into_iter() + .map(|v| v as i32) + .collect(), + sort_order_id: self.sort_order_id, + partition_spec_id, + } + } +} + +#[async_trait] +impl TableProvider for IcebergTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> Arc { + self.arrow_schema.clone() + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + fn get_table_definition(&self) -> Option<&str> { + None + } + + fn get_logical_plan(&self) -> Option> { + None + } + + async fn scan( + &self, + session: &dyn Session, + projection: Option<&Vec>, + _filters: &[Expr], + limit: Option, + ) -> DataFusionResult> { + log::info!("[ICEBERG] Starting scan for table: {}", self.table_uri); + + let object_store = self.get_object_store(session)?; + log::debug!("[ICEBERG] Got object store"); + + log::info!( + "[ICEBERG] Loading manifest list from: {}", + self.snapshot.manifest_list() + ); + let manifest_list = self.load_manifest_list(&object_store).await?; + log::info!( + "[ICEBERG] Loaded {} manifest files", + manifest_list.entries().len() + ); + + log::info!("[ICEBERG] Loading data files from manifests..."); + let data_files = self.load_data_files(&object_store, &manifest_list).await?; + log::info!("[ICEBERG] Loaded {} data files", data_files.len()); + + log::info!("[ICEBERG] Creating partitioned files..."); + let partitioned_files = 
self.create_partitioned_files(data_files)?; + log::info!( + "[ICEBERG] Created {} partitioned files", + partitioned_files.len() + ); + + // Step 4: Create file groups + let file_groups = self.create_file_groups(partitioned_files); + + // Step 5: Create file scan configuration + let file_schema = self.arrow_schema.clone(); + let table_url = Url::parse(&self.table_uri) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + let base_url = format!("{}://{}", table_url.scheme(), table_url.authority()); + let base_url_parsed = Url::parse(&base_url) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + let object_store_url = ObjectStoreUrl::parse(base_url_parsed) + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + let parquet_options = TableParquetOptions { + global: session.config().options().execution.parquet.clone(), + ..Default::default() + }; + + let parquet_source = Arc::new(ParquetSource::new(parquet_options)); + + let file_scan_config = + FileScanConfigBuilder::new(object_store_url, file_schema, parquet_source) + .with_file_groups(if file_groups.is_empty() { + vec![FileGroup::from(vec![])] + } else { + file_groups + }) + .with_statistics(Statistics::new_unknown(&self.arrow_schema)) + .with_projection(projection.cloned()) + .with_limit(limit) + .build(); + + Ok(DataSourceExec::from_data_source(file_scan_config)) + } +} + +/// Parse partition values from JSON +fn parse_partition_values(partition_json: Option<&serde_json::Value>) -> Vec> { + match partition_json { + Some(serde_json::Value::Array(values)) => values + .iter() + .map(|value| match value { + serde_json::Value::Null => None, + serde_json::Value::Bool(b) => { + Some(Literal::Primitive(PrimitiveLiteral::Boolean(*b))) + } + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + if i >= i32::MIN as i64 && i <= i32::MAX as i64 { + Some(Literal::Primitive(PrimitiveLiteral::Int(i as i32))) + } else { + Some(Literal::Primitive(PrimitiveLiteral::Long(i))) + } + } else { + n.as_f64().map(|f| { + Literal::Primitive(PrimitiveLiteral::Double( + ordered_float::OrderedFloat(f), + )) + }) + } + } + serde_json::Value::String(s) => { + Some(Literal::Primitive(PrimitiveLiteral::String(s.clone()))) + } + _ => None, + }) + .collect(), + Some(serde_json::Value::Object(_)) => { + vec![None] + } + _ => Vec::new(), + } +} + +/// Parse bounds from binary data using schema field types +fn parse_bounds_from_binary( + bounds_data: Option<&HashMap>>, + schema: &Schema, +) -> HashMap { + use crate::spec::Type; + + let mut bounds = HashMap::new(); + + if let Some(data) = bounds_data { + for (field_id, binary_data) in data { + // Find the field in schema to get its type + if let Some(field) = schema.field_by_id(*field_id) { + let field_type = field.field_type.as_ref(); + + // Parse based on primitive type + let literal = match field_type { + Type::Primitive(prim_type) => { + parse_primitive_bound(binary_data, prim_type).ok() + } + _ => None, + }; + + if let Some(lit) = literal { + bounds.insert(*field_id, lit); + } + } else { + // Fallback: if field not found, try to parse as string or binary + if let Ok(string_value) = String::from_utf8(binary_data.clone()) { + bounds.insert( + *field_id, + Literal::Primitive(PrimitiveLiteral::String(string_value)), + ); + } else { + bounds.insert( + *field_id, + Literal::Primitive(PrimitiveLiteral::Binary(binary_data.clone())), + ); + } + } + } + } + + bounds +} + +/// Parse a primitive bound value from binary data based on its type 
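+/// (Integral and floating-point values are little-endian; UUID and decimal unscaled
+/// values are big-endian, and strings are raw UTF-8 bytes.)
+/// For example, the 4-byte buffer `[0x2A, 0x00, 0x00, 0x00]` read for an `int` column
+/// decodes to `Literal::Primitive(PrimitiveLiteral::Int(42))`.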
+/// Reference: https://iceberg.apache.org/spec/#binary-single-value-serialization +fn parse_primitive_bound( + bytes: &[u8], + prim_type: &crate::spec::PrimitiveType, +) -> Result { + use num_bigint::BigInt; + use num_traits::ToPrimitive; + + use crate::spec::PrimitiveType; + + let literal = match prim_type { + PrimitiveType::Boolean => { + let val = !(bytes.len() == 1 && bytes[0] == 0u8); + PrimitiveLiteral::Boolean(val) + } + PrimitiveType::Int | PrimitiveType::Date => { + let val = i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?); + PrimitiveLiteral::Int(val) + } + PrimitiveType::Long + | PrimitiveType::Time + | PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs => { + let val = if bytes.len() == 4 { + // Handle schema evolution case + i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?) as i64 + } else { + i64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i64 bytes")?) + }; + PrimitiveLiteral::Long(val) + } + PrimitiveType::Float => { + let val = f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?); + PrimitiveLiteral::Float(ordered_float::OrderedFloat(val)) + } + PrimitiveType::Double => { + let val = if bytes.len() == 4 { + // Handle schema evolution case + f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?) as f64 + } else { + f64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f64 bytes")?) + }; + PrimitiveLiteral::Double(ordered_float::OrderedFloat(val)) + } + PrimitiveType::String => { + let val = std::str::from_utf8(bytes) + .map_err(|_| "Invalid UTF-8")? + .to_string(); + PrimitiveLiteral::String(val) + } + PrimitiveType::Uuid => { + let val = u128::from_be_bytes(bytes.try_into().map_err(|_| "Invalid UUID bytes")?); + PrimitiveLiteral::UInt128(val) + } + PrimitiveType::Fixed(_) | PrimitiveType::Binary => { + PrimitiveLiteral::Binary(Vec::from(bytes)) + } + PrimitiveType::Decimal { .. } => { + let unscaled_value = BigInt::from_signed_bytes_be(bytes); + let val = unscaled_value + .to_i128() + .ok_or_else(|| format!("Can't convert bytes to i128: {:?}", bytes))?; + PrimitiveLiteral::Int128(val) + } + }; + + Ok(Literal::Primitive(literal)) +} diff --git a/crates/sail-iceberg/src/lib.rs b/crates/sail-iceberg/src/lib.rs new file mode 100644 index 0000000000..9c91a585b2 --- /dev/null +++ b/crates/sail-iceberg/src/lib.rs @@ -0,0 +1,9 @@ +pub mod arrow_conversion; +pub mod datasource; +pub mod spec; +pub mod table_format; + +pub use arrow_conversion::*; +pub use datasource::*; +pub use spec::*; +pub use table_format::*; diff --git a/crates/sail-iceberg/src/spec/datatypes.rs b/crates/sail-iceberg/src/spec/datatypes.rs new file mode 100644 index 0000000000..d97c21f1bb --- /dev/null +++ b/crates/sail-iceberg/src/spec/datatypes.rs @@ -0,0 +1,823 @@ +use std::collections::HashMap; +use std::fmt; +use std::ops::Index; +use std::sync::{Arc, OnceLock}; + +use serde::de::{Error, IntoDeserializer, MapAccess, Visitor}; +use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; +use serde_json::Value as JsonValue; + +use super::values::Literal; +use crate::spec::PrimitiveLiteral; + +/// Field name for list type. +pub const LIST_FIELD_NAME: &str = "element"; +/// Field name for map type's key. +pub const MAP_KEY_FIELD_NAME: &str = "key"; +/// Field name for map type's value. 
+pub const MAP_VALUE_FIELD_NAME: &str = "value"; + +pub(crate) const MAX_DECIMAL_BYTES: u32 = 24; +pub(crate) const MAX_DECIMAL_PRECISION: u32 = 38; + +mod _decimal { + use once_cell::sync::Lazy; + + use crate::spec::{MAX_DECIMAL_BYTES, MAX_DECIMAL_PRECISION}; + + // Max precision of bytes, starts from 1 + pub(super) static MAX_PRECISION: Lazy<[u32; MAX_DECIMAL_BYTES as usize]> = Lazy::new(|| { + let mut ret: [u32; 24] = [0; 24]; + for (i, prec) in ret.iter_mut().enumerate() { + *prec = 2f64.powi((8 * (i + 1) - 1) as i32).log10().floor() as u32; + } + + ret + }); + + // Required bytes of precision, starts from 1 + pub(super) static REQUIRED_LENGTH: Lazy<[u32; MAX_DECIMAL_PRECISION as usize]> = + Lazy::new(|| { + let mut ret: [u32; MAX_DECIMAL_PRECISION as usize] = + [0; MAX_DECIMAL_PRECISION as usize]; + + for (i, required_len) in ret.iter_mut().enumerate() { + for j in 0..MAX_PRECISION.len() { + if MAX_PRECISION[j] >= ((i + 1) as u32) { + *required_len = (j + 1) as u32; + break; + } + } + } + + ret + }); +} + +#[derive(Debug, PartialEq, Eq, Clone)] +/// All data types are either primitives or nested types, which are maps, lists, or structs. +pub enum Type { + /// Primitive types + Primitive(PrimitiveType), + /// Struct type + Struct(StructType), + /// List type. + List(ListType), + /// Map type + Map(MapType), +} + +impl fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Type::Primitive(primitive) => write!(f, "{}", primitive), + Type::Struct(s) => write!(f, "{}", s), + Type::List(_) => write!(f, "list"), + Type::Map(_) => write!(f, "map"), + } + } +} + +impl Type { + /// Whether the type is primitive type. + #[inline(always)] + pub fn is_primitive(&self) -> bool { + matches!(self, Type::Primitive(_)) + } + + /// Whether the type is struct type. + #[inline(always)] + pub fn is_struct(&self) -> bool { + matches!(self, Type::Struct(_)) + } + + /// Whether the type is nested type. + #[inline(always)] + pub fn is_nested(&self) -> bool { + matches!(self, Type::Struct(_) | Type::List(_) | Type::Map(_)) + } + + /// Convert Type to reference of PrimitiveType + pub fn as_primitive_type(&self) -> Option<&PrimitiveType> { + if let Type::Primitive(primitive_type) = self { + Some(primitive_type) + } else { + None + } + } + + /// Convert Type to StructType + pub fn into_struct_type(self) -> Option { + if let Type::Struct(struct_type) = self { + Some(struct_type) + } else { + None + } + } + + /// Return max precision for decimal given [`num_bytes`] bytes. + #[inline(always)] + pub fn decimal_max_precision(num_bytes: u32) -> Result { + if num_bytes == 0 || num_bytes > MAX_DECIMAL_BYTES { + return Err(format!( + "Decimal length larger than {MAX_DECIMAL_BYTES} is not supported: {num_bytes}" + )); + } + Ok(_decimal::MAX_PRECISION[num_bytes as usize - 1]) + } + + /// Returns minimum bytes required for decimal with [`precision`]. + #[inline(always)] + pub fn decimal_required_bytes(precision: u32) -> Result { + if precision == 0 || precision > MAX_DECIMAL_PRECISION { + return Err(format!( + "Decimals with precision larger than {MAX_DECIMAL_PRECISION} are not supported: {precision}" + )); + } + Ok(_decimal::REQUIRED_LENGTH[precision as usize - 1]) + } + + /// Creates decimal type. 
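+ /// Returns an error if `precision` is 0 or greater than `MAX_DECIMAL_PRECISION` (38).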
+ #[inline(always)] + pub fn decimal(precision: u32, scale: u32) -> Result { + if precision == 0 || precision > MAX_DECIMAL_PRECISION { + return Err(format!( + "Decimals with precision larger than {MAX_DECIMAL_PRECISION} are not supported: {precision}" + )); + } + Ok(Type::Primitive(PrimitiveType::Decimal { precision, scale })) + } + + /// Check if it's float or double type. + #[inline(always)] + pub fn is_floating_type(&self) -> bool { + matches!( + self, + Type::Primitive(PrimitiveType::Float) | Type::Primitive(PrimitiveType::Double) + ) + } +} + +impl From for Type { + fn from(value: PrimitiveType) -> Self { + Self::Primitive(value) + } +} + +impl From for Type { + fn from(value: StructType) -> Self { + Type::Struct(value) + } +} + +impl From for Type { + fn from(value: ListType) -> Self { + Type::List(value) + } +} + +impl From for Type { + fn from(value: MapType) -> Self { + Type::Map(value) + } +} + +/// Primitive data types +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Hash)] +#[serde(rename_all = "lowercase", remote = "Self")] +pub enum PrimitiveType { + /// True or False + Boolean, + /// 32-bit signed integer + Int, + /// 64-bit signed integer + Long, + /// 32-bit IEEE 754 floating point. + Float, + /// 64-bit IEEE 754 floating point. + Double, + /// Fixed point decimal + Decimal { + /// Precision, must be 38 or less + precision: u32, + /// Scale + scale: u32, + }, + /// Calendar date without timezone or time. + Date, + /// Time of day in microsecond precision, without date or timezone. + Time, + /// Timestamp in microsecond precision, without timezone + Timestamp, + /// Timestamp in microsecond precision, with timezone + Timestamptz, + /// Timestamp in nanosecond precision, without timezone + #[serde(rename = "timestamp_ns")] + TimestampNs, + /// Timestamp in nanosecond precision with timezone + #[serde(rename = "timestamptz_ns")] + TimestamptzNs, + /// Arbitrary-length character sequences encoded in utf-8 + String, + /// Universally Unique Identifiers, should use 16-byte fixed + Uuid, + /// Fixed length byte array + Fixed(u64), + /// Arbitrary-length byte array. + Binary, +} + +impl PrimitiveType { + /// Check whether literal is compatible with the type. + pub fn compatible(&self, literal: &PrimitiveLiteral) -> bool { + matches!( + (self, literal), + (PrimitiveType::Boolean, PrimitiveLiteral::Boolean(_)) + | (PrimitiveType::Int, PrimitiveLiteral::Int(_)) + | (PrimitiveType::Long, PrimitiveLiteral::Long(_)) + | (PrimitiveType::Float, PrimitiveLiteral::Float(_)) + | (PrimitiveType::Double, PrimitiveLiteral::Double(_)) + | (PrimitiveType::Decimal { .. 
}, PrimitiveLiteral::Int128(_)) + | (PrimitiveType::Date, PrimitiveLiteral::Int(_)) + | (PrimitiveType::Time, PrimitiveLiteral::Long(_)) + | (PrimitiveType::Timestamp, PrimitiveLiteral::Long(_)) + | (PrimitiveType::Timestamptz, PrimitiveLiteral::Long(_)) + | (PrimitiveType::TimestampNs, PrimitiveLiteral::Long(_)) + | (PrimitiveType::TimestamptzNs, PrimitiveLiteral::Long(_)) + | (PrimitiveType::String, PrimitiveLiteral::String(_)) + | (PrimitiveType::Uuid, PrimitiveLiteral::UInt128(_)) + | (PrimitiveType::Fixed(_), PrimitiveLiteral::Binary(_)) + | (PrimitiveType::Binary, PrimitiveLiteral::Binary(_)) + ) + } +} + +impl Serialize for Type { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + let type_serde = _serde::SerdeType::from(self); + type_serde.serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for Type { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + let type_serde = _serde::SerdeType::deserialize(deserializer)?; + Ok(Type::from(type_serde)) + } +} + +impl<'de> Deserialize<'de> for PrimitiveType { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + if s.starts_with("decimal") { + deserialize_decimal(s.into_deserializer()) + } else if s.starts_with("fixed") { + deserialize_fixed(s.into_deserializer()) + } else { + PrimitiveType::deserialize(s.into_deserializer()) + } + } +} + +impl Serialize for PrimitiveType { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + match self { + PrimitiveType::Decimal { precision, scale } => { + serialize_decimal(precision, scale, serializer) + } + PrimitiveType::Fixed(l) => serialize_fixed(l, serializer), + _ => PrimitiveType::serialize(self, serializer), + } + } +} + +fn deserialize_decimal<'de, D>(deserializer: D) -> std::result::Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + let (precision, scale) = s + .trim_start_matches(r"decimal(") + .trim_end_matches(')') + .split_once(',') + .ok_or_else(|| D::Error::custom(format!("Decimal requires precision and scale: {s}")))?; + + Ok(PrimitiveType::Decimal { + precision: precision.trim().parse().map_err(D::Error::custom)?, + scale: scale.trim().parse().map_err(D::Error::custom)?, + }) +} + +fn serialize_decimal( + precision: &u32, + scale: &u32, + serializer: S, +) -> std::result::Result +where + S: Serializer, +{ + serializer.serialize_str(&format!("decimal({precision},{scale})")) +} + +fn deserialize_fixed<'de, D>(deserializer: D) -> std::result::Result +where + D: Deserializer<'de>, +{ + let fixed = String::deserialize(deserializer)? 
+ .trim_start_matches(r"fixed[") + .trim_end_matches(']') + .to_owned(); + + fixed + .parse() + .map(PrimitiveType::Fixed) + .map_err(D::Error::custom) +} + +fn serialize_fixed(value: &u64, serializer: S) -> std::result::Result +where + S: Serializer, +{ + serializer.serialize_str(&format!("fixed[{value}]")) +} + +impl fmt::Display for PrimitiveType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + PrimitiveType::Boolean => write!(f, "boolean"), + PrimitiveType::Int => write!(f, "int"), + PrimitiveType::Long => write!(f, "long"), + PrimitiveType::Float => write!(f, "float"), + PrimitiveType::Double => write!(f, "double"), + PrimitiveType::Decimal { precision, scale } => { + write!(f, "decimal({},{})", precision, scale) + } + PrimitiveType::Date => write!(f, "date"), + PrimitiveType::Time => write!(f, "time"), + PrimitiveType::Timestamp => write!(f, "timestamp"), + PrimitiveType::Timestamptz => write!(f, "timestamptz"), + PrimitiveType::TimestampNs => write!(f, "timestamp_ns"), + PrimitiveType::TimestamptzNs => write!(f, "timestamptz_ns"), + PrimitiveType::String => write!(f, "string"), + PrimitiveType::Uuid => write!(f, "uuid"), + PrimitiveType::Fixed(size) => write!(f, "fixed({})", size), + PrimitiveType::Binary => write!(f, "binary"), + } + } +} + +/// DataType for a specific struct +#[derive(Debug, Serialize, Clone, Default)] +#[serde(rename = "struct", tag = "type")] +pub struct StructType { + /// Struct fields + fields: Vec, + /// Lookup for index by field id + #[serde(skip_serializing)] + id_lookup: OnceLock>, + #[serde(skip_serializing)] + name_lookup: OnceLock>, +} + +impl<'de> Deserialize<'de> for StructType { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(field_identifier, rename_all = "lowercase")] + enum Field { + Type, + Fields, + } + + struct StructTypeVisitor; + + impl<'de> Visitor<'de> for StructTypeVisitor { + type Value = StructType; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct") + } + + fn visit_map(self, mut map: V) -> std::result::Result + where + V: MapAccess<'de>, + { + let mut fields = None; + while let Some(key) = map.next_key()? { + match key { + Field::Type => (), + Field::Fields => { + if fields.is_some() { + return Err(serde::de::Error::duplicate_field("fields")); + } + fields = Some(map.next_value()?); + } + } + } + let fields: Vec = + fields.ok_or_else(|| de::Error::missing_field("fields"))?; + + Ok(StructType::new(fields)) + } + } + + const FIELDS: &[&str] = &["type", "fields"]; + deserializer.deserialize_struct("struct", FIELDS, StructTypeVisitor) + } +} + +impl StructType { + /// Creates a struct type with the given fields. 
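+ /// The by-id and by-name field lookup tables are built lazily on first access.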
+ pub fn new(fields: Vec) -> Self { + Self { + fields, + id_lookup: OnceLock::new(), + name_lookup: OnceLock::new(), + } + } + + /// Get struct field with certain id + pub fn field_by_id(&self, id: i32) -> Option<&NestedFieldRef> { + self.field_id_to_index(id).map(|idx| &self.fields[idx]) + } + + fn field_id_to_index(&self, field_id: i32) -> Option { + self.id_lookup + .get_or_init(|| { + HashMap::from_iter(self.fields.iter().enumerate().map(|(i, x)| (x.id, i))) + }) + .get(&field_id) + .copied() + } + + /// Get struct field with certain field name + pub fn field_by_name(&self, name: &str) -> Option<&NestedFieldRef> { + self.field_name_to_index(name).map(|idx| &self.fields[idx]) + } + + fn field_name_to_index(&self, name: &str) -> Option { + self.name_lookup + .get_or_init(|| { + HashMap::from_iter( + self.fields + .iter() + .enumerate() + .map(|(i, x)| (x.name.clone(), i)), + ) + }) + .get(name) + .copied() + } + + /// Get fields. + pub fn fields(&self) -> &[NestedFieldRef] { + &self.fields + } +} + +impl PartialEq for StructType { + fn eq(&self, other: &Self) -> bool { + self.fields == other.fields + } +} + +impl Eq for StructType {} + +impl Index for StructType { + type Output = NestedField; + + fn index(&self, index: usize) -> &Self::Output { + &self.fields[index] + } +} + +impl fmt::Display for StructType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "struct<")?; + for field in &self.fields { + write!(f, "{}", field.field_type)?; + } + write!(f, ">") + } +} + +#[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)] +#[serde(from = "SerdeNestedField", into = "SerdeNestedField")] +/// A struct is a tuple of typed values. Each field in the tuple is named and has an integer id that is unique in the table schema. +/// Each field can be either optional or required, meaning that values can (or cannot) be null. Fields may be any type. +/// Fields may have an optional comment or doc string. Fields can have default values. +pub struct NestedField { + /// Id unique in table schema + pub id: i32, + /// Field Name + pub name: String, + /// Optional or required + pub required: bool, + /// Datatype + pub field_type: Box, + /// Fields may have an optional comment or doc string. 
+ pub doc: Option, + /// Used to populate the field's value for all records that were written before the field was added to the schema + pub initial_default: Option, + /// Used to populate the field's value for any records written after the field was added to the schema, if the writer does not supply the field's value + pub write_default: Option, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +struct SerdeNestedField { + pub id: i32, + pub name: String, + pub required: bool, + #[serde(rename = "type")] + pub field_type: Box, + #[serde(skip_serializing_if = "Option::is_none")] + pub doc: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub initial_default: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub write_default: Option, +} + +impl From for NestedField { + fn from(value: SerdeNestedField) -> Self { + NestedField { + id: value.id, + name: value.name, + required: value.required, + initial_default: value.initial_default.and_then(|x| { + Literal::try_from_json(x, &value.field_type) + .ok() + .and_then(|x| x) + }), + write_default: value.write_default.and_then(|x| { + Literal::try_from_json(x, &value.field_type) + .ok() + .and_then(|x| x) + }), + field_type: value.field_type, + doc: value.doc, + } + } +} + +impl From for SerdeNestedField { + fn from(value: NestedField) -> Self { + let initial_default = value + .initial_default + .and_then(|x| x.try_into_json(&value.field_type).ok()); + let write_default = value + .write_default + .and_then(|x| x.try_into_json(&value.field_type).ok()); + SerdeNestedField { + id: value.id, + name: value.name, + required: value.required, + field_type: value.field_type, + doc: value.doc, + initial_default, + write_default, + } + } +} + +/// Reference to nested field. +pub type NestedFieldRef = Arc; + +impl NestedField { + /// Construct a new field. + pub fn new(id: i32, name: impl ToString, field_type: Type, required: bool) -> Self { + Self { + id, + name: name.to_string(), + required, + field_type: Box::new(field_type), + doc: None, + initial_default: None, + write_default: None, + } + } + + /// Construct a required field. + pub fn required(id: i32, name: impl ToString, field_type: Type) -> Self { + Self::new(id, name, field_type, true) + } + + /// Construct an optional field. + pub fn optional(id: i32, name: impl ToString, field_type: Type) -> Self { + Self::new(id, name, field_type, false) + } + + /// Construct list type's element field. + pub fn list_element(id: i32, field_type: Type, required: bool) -> Self { + Self::new(id, LIST_FIELD_NAME, field_type, required) + } + + /// Construct map type's key field. + pub fn map_key_element(id: i32, field_type: Type) -> Self { + Self::required(id, MAP_KEY_FIELD_NAME, field_type) + } + + /// Construct map type's value field. + pub fn map_value_element(id: i32, field_type: Type, required: bool) -> Self { + Self::new(id, MAP_VALUE_FIELD_NAME, field_type, required) + } + + /// Set the field's doc. + pub fn with_doc(mut self, doc: impl ToString) -> Self { + self.doc = Some(doc.to_string()); + self + } + + /// Set the field's initial default value. + pub fn with_initial_default(mut self, value: Literal) -> Self { + self.initial_default = Some(value); + self + } + + /// Set the field's initial default value. + pub fn with_write_default(mut self, value: Literal) -> Self { + self.write_default = Some(value); + self + } + + /// Set the id of the field. 
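// Illustrative usage (not part of this patch): building a couple of table columns
// with the NestedField constructors above. The field ids, names, and doc text are
// hypothetical, and the usual imports (Arc, Type, PrimitiveType) are assumed.
fn example_columns() -> Vec<NestedFieldRef> {
    vec![
        // A required 64-bit integer primary-key column.
        Arc::new(NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))),
        // An optional string column with a doc string attached.
        Arc::new(
            NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String))
                .with_doc("display name"),
        ),
    ]
}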
+ #[allow(unused)] + pub(crate) fn with_id(mut self, id: i32) -> Self { + self.id = id; + self + } +} + +impl fmt::Display for NestedField { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}: ", self.id)?; + write!(f, "{}: ", self.name)?; + if self.required { + write!(f, "required ")?; + } else { + write!(f, "optional ")?; + } + write!(f, "{} ", self.field_type)?; + if let Some(doc) = &self.doc { + write!(f, "{}", doc)?; + } + Ok(()) + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +/// A list is a collection of values with some element type. The element field has an integer id that is unique in the table schema. +/// Elements can be either optional or required. Element types may be any type. +pub struct ListType { + /// Element field of list type. + pub element_field: NestedFieldRef, +} + +impl ListType { + /// Construct a list type with the given element field. + pub fn new(element_field: NestedFieldRef) -> Self { + Self { element_field } + } +} + +/// Module for type serialization/deserialization. +pub(super) mod _serde { + use std::borrow::Cow; + + use serde::{Deserialize, Serialize}; + + use crate::spec::datatypes::Type::Map; + use crate::spec::datatypes::{ + ListType, MapType, NestedField, NestedFieldRef, PrimitiveType, StructType, Type, + }; + + /// List type for serialization and deserialization + #[derive(Serialize, Deserialize)] + #[serde(untagged)] + pub(super) enum SerdeType<'a> { + #[serde(rename_all = "kebab-case")] + List { + r#type: String, + element_id: i32, + element_required: bool, + element: Cow<'a, Type>, + }, + Struct { + r#type: String, + fields: Cow<'a, [NestedFieldRef]>, + }, + #[serde(rename_all = "kebab-case")] + Map { + r#type: String, + key_id: i32, + key: Cow<'a, Type>, + value_id: i32, + value_required: bool, + value: Cow<'a, Type>, + }, + Primitive(PrimitiveType), + } + + impl From> for Type { + fn from(value: SerdeType) -> Self { + match value { + SerdeType::List { + r#type: _, + element_id, + element_required, + element, + } => Self::List(ListType { + element_field: NestedField::list_element( + element_id, + element.into_owned(), + element_required, + ) + .into(), + }), + SerdeType::Map { + r#type: _, + key_id, + key, + value_id, + value_required, + value, + } => Map(MapType { + key_field: NestedField::map_key_element(key_id, key.into_owned()).into(), + value_field: NestedField::map_value_element( + value_id, + value.into_owned(), + value_required, + ) + .into(), + }), + SerdeType::Struct { r#type: _, fields } => { + Self::Struct(StructType::new(fields.into_owned())) + } + SerdeType::Primitive(p) => Self::Primitive(p), + } + } + } + + impl<'a> From<&'a Type> for SerdeType<'a> { + fn from(value: &'a Type) -> Self { + match value { + Type::List(list) => SerdeType::List { + r#type: "list".to_string(), + element_id: list.element_field.id, + element_required: list.element_field.required, + element: Cow::Borrowed(&list.element_field.field_type), + }, + Type::Map(map) => SerdeType::Map { + r#type: "map".to_string(), + key_id: map.key_field.id, + key: Cow::Borrowed(&map.key_field.field_type), + value_id: map.value_field.id, + value_required: map.value_field.required, + value: Cow::Borrowed(&map.value_field.field_type), + }, + Type::Struct(s) => SerdeType::Struct { + r#type: "struct".to_string(), + fields: Cow::Borrowed(&s.fields), + }, + Type::Primitive(p) => SerdeType::Primitive(p.clone()), + } + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +/// A map is a collection of key-value pairs with a key type and a value type. 
+/// Both the key field and value field each have an integer id that is unique in the table schema. +/// Map keys are required and map values can be either optional or required. +/// Both map keys and map values may be any type, including nested types. +pub struct MapType { + /// Field for key. + pub key_field: NestedFieldRef, + /// Field for value. + pub value_field: NestedFieldRef, +} + +impl MapType { + /// Construct a map type with the given key and value fields. + pub fn new(key_field: NestedFieldRef, value_field: NestedFieldRef) -> Self { + Self { + key_field, + value_field, + } + } +} diff --git a/crates/sail-iceberg/src/spec/manifest.rs b/crates/sail-iceberg/src/spec/manifest.rs new file mode 100644 index 0000000000..e05324417b --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest.rs @@ -0,0 +1,447 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use super::partition::PartitionSpec; +use super::schema::SchemaRef; +use super::values::Literal; + +/// Reference to [`ManifestEntry`]. +pub type ManifestEntryRef = Arc; + +/// A manifest contains metadata and a list of entries. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Manifest { + /// Metadata about the manifest. + pub metadata: ManifestMetadata, + /// Entries in the manifest. + pub entries: Vec, +} + +impl Manifest { + /// Create a new manifest. + pub fn new(metadata: ManifestMetadata, entries: Vec) -> Self { + Self { + metadata, + entries: entries.into_iter().map(Arc::new).collect(), + } + } + + /// Get the entries in the manifest. + pub fn entries(&self) -> &[ManifestEntryRef] { + &self.entries + } + + /// Get the metadata of the manifest. + pub fn metadata(&self) -> &ManifestMetadata { + &self.metadata + } + + /// Consume this Manifest, returning its constituent parts + pub fn into_parts(self) -> (Vec, ManifestMetadata) { + let Self { entries, metadata } = self; + (entries, metadata) + } +} + +/// Metadata about a manifest file. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct ManifestMetadata { + /// The schema of the table when the manifest was written. + pub schema: SchemaRef, + /// The partition spec used to write the manifest. + pub partition_spec: PartitionSpec, + /// The format version of the manifest. + pub format_version: FormatVersion, +} + +impl ManifestMetadata { + /// Create new manifest metadata. + pub fn new( + schema: SchemaRef, + partition_spec: PartitionSpec, + format_version: FormatVersion, + ) -> Self { + Self { + schema, + partition_spec, + format_version, + } + } +} + +/// Format version of Iceberg. +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum FormatVersion { + /// Version 1 + V1 = 1, + /// Version 2 + V2 = 2, +} + +impl serde::Serialize for FormatVersion { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_i32(*self as i32) + } +} + +impl<'de> serde::Deserialize<'de> for FormatVersion { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let value = i32::deserialize(deserializer)?; + match value { + 1 => Ok(FormatVersion::V1), + 2 => Ok(FormatVersion::V2), + _ => Err(serde::de::Error::custom(format!( + "Invalid format version: {}", + value + ))), + } + } +} + +impl Default for FormatVersion { + fn default() -> Self { + Self::V2 + } +} + +/// Status of a manifest entry. 
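// Illustrative usage (not part of this patch): assembling an empty manifest for an
// unpartitioned table. `schema` is assumed to be built elsewhere; the unpartitioned
// spec helper comes from the `partition` module added in this same patch.
fn example_manifest(schema: SchemaRef) -> Manifest {
    let metadata = ManifestMetadata::new(
        schema,
        PartitionSpec::unpartitioned_spec(),
        FormatVersion::V2,
    );
    // No data files yet; entries are wrapped in Arc by Manifest::new.
    Manifest::new(metadata, vec![])
}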
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum ManifestStatus { + /// The data file was added in this snapshot. + Added, + /// The data file exists in the table. + Existing, + /// The data file was deleted in this snapshot. + Deleted, +} + +/// Content type of a data file. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum DataContentType { + /// The file contains data. + Data, + /// The file contains position deletes. + PositionDeletes, + /// The file contains equality deletes. + EqualityDeletes, +} + +/// File format of a data file. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum DataFileFormat { + /// Avro format + Avro, + /// ORC format + Orc, + /// Parquet format + Parquet, + /// Puffin format (for delete files) + Puffin, +} + +/// A manifest entry represents a data file in a manifest. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct ManifestEntry { + /// The status of the data file. + pub status: ManifestStatus, + /// The snapshot ID when the data file was added to the table. + pub snapshot_id: i64, + /// The sequence number when the data file was added to the table. + pub sequence_number: i64, + /// The data file. + pub data_file: DataFile, +} + +impl ManifestEntry { + /// Create a new manifest entry. + pub fn new( + status: ManifestStatus, + snapshot_id: i64, + sequence_number: i64, + data_file: DataFile, + ) -> Self { + Self { + status, + snapshot_id, + sequence_number, + data_file, + } + } +} + +/// A data file in Iceberg. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct DataFile { + /// Type of content stored by the data file. + pub content: DataContentType, + /// Full URI for the file with FS scheme. + pub file_path: String, + /// File format name. + pub file_format: DataFileFormat, + /// Partition data tuple. + pub partition: Vec>, + /// Number of records in this file. + pub record_count: u64, + /// Total file size in bytes. + pub file_size_in_bytes: u64, + /// Map from column id to the total size on disk of all regions that store the column. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub column_sizes: HashMap, + /// Map from column id to number of values in the column. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub value_counts: HashMap, + /// Map from column id to number of null values in the column. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub null_value_counts: HashMap, + /// Map from column id to number of NaN values in the column. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub nan_value_counts: HashMap, + /// Map from column id to lower bound in the column. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub lower_bounds: HashMap, + /// Map from column id to upper bound in the column. + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub upper_bounds: HashMap, + /// Implementation-specific key metadata for encryption. + #[serde(skip_serializing_if = "Option::is_none")] + pub key_metadata: Option>, + /// Split offsets for the data file. + #[serde(skip_serializing_if = "Vec::is_empty")] + pub split_offsets: Vec, + /// Field ids used to determine row equality in equality delete files. + #[serde(skip_serializing_if = "Vec::is_empty")] + pub equality_ids: Vec, + /// ID representing sort order for this file. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub sort_order_id: Option, + /// The partition spec id used when writing this data file. + pub partition_spec_id: i32, +} + +impl DataFile { + /// Create a new data file builder. + pub fn builder() -> DataFileBuilder { + DataFileBuilder::new() + } + + /// Get the content type of the data file. + pub fn content_type(&self) -> DataContentType { + self.content + } + + /// Get the file path. + pub fn file_path(&self) -> &str { + &self.file_path + } + + /// Get the file format. + pub fn file_format(&self) -> DataFileFormat { + self.file_format + } + + /// Get the partition values. + pub fn partition(&self) -> &[Option] { + &self.partition + } + + /// Get the record count. + pub fn record_count(&self) -> u64 { + self.record_count + } + + /// Get the file size in bytes. + pub fn file_size_in_bytes(&self) -> u64 { + self.file_size_in_bytes + } + + /// Get column sizes. + pub fn column_sizes(&self) -> &HashMap { + &self.column_sizes + } + + /// Get value counts. + pub fn value_counts(&self) -> &HashMap { + &self.value_counts + } + + /// Get null value counts. + pub fn null_value_counts(&self) -> &HashMap { + &self.null_value_counts + } + + /// Get NaN value counts. + pub fn nan_value_counts(&self) -> &HashMap { + &self.nan_value_counts + } + + /// Get lower bounds. + pub fn lower_bounds(&self) -> &HashMap { + &self.lower_bounds + } + + /// Get upper bounds. + pub fn upper_bounds(&self) -> &HashMap { + &self.upper_bounds + } +} + +/// Builder for creating data files. +#[derive(Debug)] +pub struct DataFileBuilder { + content: DataContentType, + file_path: Option, + file_format: DataFileFormat, + partition: Vec>, + record_count: u64, + file_size_in_bytes: u64, + column_sizes: HashMap, + value_counts: HashMap, + null_value_counts: HashMap, + nan_value_counts: HashMap, + lower_bounds: HashMap, + upper_bounds: HashMap, + key_metadata: Option>, + split_offsets: Vec, + equality_ids: Vec, + sort_order_id: Option, + partition_spec_id: i32, +} + +impl DataFileBuilder { + /// Create a new data file builder. + pub fn new() -> Self { + Self { + content: DataContentType::Data, + file_path: None, + file_format: DataFileFormat::Parquet, + partition: Vec::new(), + record_count: 0, + file_size_in_bytes: 0, + column_sizes: HashMap::new(), + value_counts: HashMap::new(), + null_value_counts: HashMap::new(), + nan_value_counts: HashMap::new(), + lower_bounds: HashMap::new(), + upper_bounds: HashMap::new(), + key_metadata: None, + split_offsets: Vec::new(), + equality_ids: Vec::new(), + sort_order_id: None, + partition_spec_id: 0, + } + } + + /// Set the content type. + pub fn with_content(mut self, content: DataContentType) -> Self { + self.content = content; + self + } + + /// Set the file path. + pub fn with_file_path(mut self, file_path: impl ToString) -> Self { + self.file_path = Some(file_path.to_string()); + self + } + + /// Set the file format. + pub fn with_file_format(mut self, file_format: DataFileFormat) -> Self { + self.file_format = file_format; + self + } + + /// Set the partition values. + pub fn with_partition(mut self, partition: Vec>) -> Self { + self.partition = partition; + self + } + + /// Set the record count. + pub fn with_record_count(mut self, record_count: u64) -> Self { + self.record_count = record_count; + self + } + + /// Set the file size in bytes. + pub fn with_file_size_in_bytes(mut self, file_size_in_bytes: u64) -> Self { + self.file_size_in_bytes = file_size_in_bytes; + self + } + + /// Set the partition spec id. 
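// Illustrative usage (not part of this patch): describing a Parquet data file with
// the builder above. The path and counts are made up; `build` (defined further
// below) fails only if `file_path` was never set, and its error type is elided
// via `ok()` here.
fn example_data_file() -> Option<DataFile> {
    DataFile::builder()
        .with_file_path("s3://bucket/db/tbl/data/part-00000.parquet")
        .with_file_format(DataFileFormat::Parquet)
        .with_record_count(1_000)
        .with_file_size_in_bytes(4 * 1024 * 1024)
        .build()
        .ok()
}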
+ pub fn with_partition_spec_id(mut self, partition_spec_id: i32) -> Self { + self.partition_spec_id = partition_spec_id; + self + } + + /// Add column size. + pub fn with_column_size(mut self, column_id: i32, size: u64) -> Self { + self.column_sizes.insert(column_id, size); + self + } + + /// Add value count. + pub fn with_value_count(mut self, column_id: i32, count: u64) -> Self { + self.value_counts.insert(column_id, count); + self + } + + /// Add null value count. + pub fn with_null_value_count(mut self, column_id: i32, count: u64) -> Self { + self.null_value_counts.insert(column_id, count); + self + } + + /// Add lower bound. + pub fn with_lower_bound(mut self, column_id: i32, bound: Literal) -> Self { + self.lower_bounds.insert(column_id, bound); + self + } + + /// Add upper bound. + pub fn with_upper_bound(mut self, column_id: i32, bound: Literal) -> Self { + self.upper_bounds.insert(column_id, bound); + self + } + + /// Build the data file. + pub fn build(self) -> Result { + let file_path = self.file_path.ok_or("file_path is required")?; + + Ok(DataFile { + content: self.content, + file_path, + file_format: self.file_format, + partition: self.partition, + record_count: self.record_count, + file_size_in_bytes: self.file_size_in_bytes, + column_sizes: self.column_sizes, + value_counts: self.value_counts, + null_value_counts: self.null_value_counts, + nan_value_counts: self.nan_value_counts, + lower_bounds: self.lower_bounds, + upper_bounds: self.upper_bounds, + key_metadata: self.key_metadata, + split_offsets: self.split_offsets, + equality_ids: self.equality_ids, + sort_order_id: self.sort_order_id, + partition_spec_id: self.partition_spec_id, + }) + } +} + +impl Default for DataFileBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs new file mode 100644 index 0000000000..b7559d460a --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -0,0 +1,308 @@ +use serde::{Deserialize, Serialize}; + +use super::values::Literal; + +pub const UNASSIGNED_SEQUENCE_NUMBER: i64 = -1; + +/// Snapshots are embedded in table metadata, but the list of manifests for a +/// snapshot are stored in a separate manifest list file. +/// +/// A new manifest list is written for each attempt to commit a snapshot +/// because the list of manifests always changes to produce a new snapshot. +/// When a manifest list is written, the (optimistic) sequence number of the +/// snapshot is written for all new manifest files tracked by the list. +/// +/// A manifest list includes summary metadata that can be used to avoid +/// scanning all of the manifests in a snapshot when planning a table scan. +/// This includes the number of added, existing, and deleted files, and a +/// summary of values for each field of the partition spec used to write the +/// manifest. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ManifestList { + /// Entries in a manifest list. + pub entries: Vec, +} + +impl ManifestList { + /// Create a new manifest list. + pub fn new(entries: Vec) -> Self { + Self { entries } + } + + /// Get the entries in the manifest list. + pub fn entries(&self) -> &[ManifestFile] { + &self.entries + } + + /// Take ownership of the entries in the manifest list, consuming it + pub fn into_entries(self) -> Vec { + self.entries + } +} + +/// Status of a manifest file in a manifest list. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ManifestFileStatus { + /// The manifest file was added in this snapshot. + Added, + /// The manifest file was inherited from the parent snapshot. + Existing, + /// The manifest file was deleted in this snapshot. + Deleted, +} + +/// Content type of a manifest file. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ManifestContentType { + /// The manifest contains data files. + Data, + /// The manifest contains delete files. + Deletes, +} + +/// A manifest file in a manifest list. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct ManifestFile { + /// The path to the manifest file. + pub manifest_path: String, + /// The length of the manifest file in bytes. + pub manifest_length: i64, + /// The ID of the partition spec used to write the manifest. + pub partition_spec_id: i32, + /// The content type of the manifest file. + pub content: ManifestContentType, + /// The sequence number when the manifest was added to the table. + pub sequence_number: i64, + /// The minimum sequence number of all data files in the manifest. + pub min_sequence_number: i64, + /// The snapshot ID when the manifest was added to the table. + pub added_snapshot_id: i64, + /// The number of files added in this manifest. + pub added_files_count: i32, + /// The number of existing files in this manifest. + pub existing_files_count: i32, + /// The number of deleted files in this manifest. + pub deleted_files_count: i32, + /// The number of rows added in this manifest. + pub added_rows_count: i64, + /// The number of existing rows in this manifest. + pub existing_rows_count: i64, + /// The number of deleted rows in this manifest. + pub deleted_rows_count: i64, + /// A list of field summaries for each partition field in the spec. + /// Each field in the list corresponds to a field in the manifest file's partition spec. + #[serde(skip_serializing_if = "Option::is_none")] + pub partitions: Option>, + /// Implementation-specific key metadata for encryption. + #[serde(skip_serializing_if = "Option::is_none")] + pub key_metadata: Option>, +} + +impl ManifestFile { + /// Create a new manifest file builder. + pub fn builder() -> ManifestFileBuilder { + ManifestFileBuilder::new() + } + + /// Get the total number of files in this manifest. + pub fn total_files_count(&self) -> i32 { + self.added_files_count + self.existing_files_count + self.deleted_files_count + } + + /// Get the total number of rows in this manifest. + pub fn total_rows_count(&self) -> i64 { + self.added_rows_count + self.existing_rows_count + self.deleted_rows_count + } +} + +/// Field summary for partition fields in a manifest file. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct FieldSummary { + /// Whether the partition field contains null values. + pub contains_null: bool, + /// Whether the partition field contains NaN values (only for float and double). + #[serde(skip_serializing_if = "Option::is_none")] + pub contains_nan: Option, + /// The minimum value of the partition field. + #[serde(skip_serializing_if = "Option::is_none")] + pub lower_bound: Option, + /// The maximum value of the partition field. + #[serde(skip_serializing_if = "Option::is_none")] + pub upper_bound: Option, +} + +impl FieldSummary { + /// Create a new field summary. 
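// Illustrative usage (not part of this patch): adding up the per-manifest counters
// defined above to estimate how many live data files a snapshot references.
fn live_file_count(list: &ManifestList) -> i64 {
    list.entries()
        .iter()
        .map(|m| (m.added_files_count + m.existing_files_count) as i64)
        .sum()
}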
+ pub fn new(contains_null: bool) -> Self { + Self { + contains_null, + contains_nan: None, + lower_bound: None, + upper_bound: None, + } + } + + /// Set whether the field contains NaN values. + pub fn with_contains_nan(mut self, contains_nan: bool) -> Self { + self.contains_nan = Some(contains_nan); + self + } + + /// Set the lower bound of the field. + pub fn with_lower_bound(mut self, lower_bound: Literal) -> Self { + self.lower_bound = Some(lower_bound); + self + } + + /// Set the upper bound of the field. + pub fn with_upper_bound(mut self, upper_bound: Literal) -> Self { + self.upper_bound = Some(upper_bound); + self + } +} + +/// Builder for creating manifest files. +#[derive(Debug)] +pub struct ManifestFileBuilder { + manifest_path: Option, + manifest_length: i64, + partition_spec_id: i32, + content: ManifestContentType, + sequence_number: i64, + min_sequence_number: i64, + added_snapshot_id: i64, + added_files_count: i32, + existing_files_count: i32, + deleted_files_count: i32, + added_rows_count: i64, + existing_rows_count: i64, + deleted_rows_count: i64, + partitions: Option>, + key_metadata: Option>, +} + +impl ManifestFileBuilder { + /// Create a new manifest file builder. + pub fn new() -> Self { + Self { + manifest_path: None, + manifest_length: 0, + partition_spec_id: 0, + content: ManifestContentType::Data, + sequence_number: UNASSIGNED_SEQUENCE_NUMBER, + min_sequence_number: UNASSIGNED_SEQUENCE_NUMBER, + added_snapshot_id: 0, + added_files_count: 0, + existing_files_count: 0, + deleted_files_count: 0, + added_rows_count: 0, + existing_rows_count: 0, + deleted_rows_count: 0, + partitions: None, + key_metadata: None, + } + } + + /// Set the manifest path. + pub fn with_manifest_path(mut self, manifest_path: impl ToString) -> Self { + self.manifest_path = Some(manifest_path.to_string()); + self + } + + /// Set the manifest length. + pub fn with_manifest_length(mut self, manifest_length: i64) -> Self { + self.manifest_length = manifest_length; + self + } + + /// Set the partition spec id. + pub fn with_partition_spec_id(mut self, partition_spec_id: i32) -> Self { + self.partition_spec_id = partition_spec_id; + self + } + + /// Set the content type. + pub fn with_content(mut self, content: ManifestContentType) -> Self { + self.content = content; + self + } + + /// Set the sequence number. + pub fn with_sequence_number(mut self, sequence_number: i64) -> Self { + self.sequence_number = sequence_number; + self + } + + /// Set the minimum sequence number. + pub fn with_min_sequence_number(mut self, min_sequence_number: i64) -> Self { + self.min_sequence_number = min_sequence_number; + self + } + + /// Set the added snapshot id. + pub fn with_added_snapshot_id(mut self, added_snapshot_id: i64) -> Self { + self.added_snapshot_id = added_snapshot_id; + self + } + + /// Set the file counts. + pub fn with_file_counts(mut self, added: i32, existing: i32, deleted: i32) -> Self { + self.added_files_count = added; + self.existing_files_count = existing; + self.deleted_files_count = deleted; + self + } + + /// Set the row counts. + pub fn with_row_counts(mut self, added: i64, existing: i64, deleted: i64) -> Self { + self.added_rows_count = added; + self.existing_rows_count = existing; + self.deleted_rows_count = deleted; + self + } + + /// Set the partitions. + pub fn with_partitions(mut self, partitions: Vec) -> Self { + self.partitions = Some(partitions); + self + } + + /// Set the key metadata. 
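// Illustrative usage (not part of this patch): a per-partition-field summary for a
// field with no nulls and no NaNs. Bounds are left unset because constructing
// `Literal` values is outside the scope of this sketch.
fn example_summary() -> FieldSummary {
    FieldSummary::new(false).with_contains_nan(false)
}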
+ pub fn with_key_metadata(mut self, key_metadata: Vec) -> Self { + self.key_metadata = Some(key_metadata); + self + } + + /// Build the manifest file. + pub fn build(self) -> Result { + let manifest_path = self.manifest_path.ok_or("manifest_path is required")?; + + Ok(ManifestFile { + manifest_path, + manifest_length: self.manifest_length, + partition_spec_id: self.partition_spec_id, + content: self.content, + sequence_number: self.sequence_number, + min_sequence_number: self.min_sequence_number, + added_snapshot_id: self.added_snapshot_id, + added_files_count: self.added_files_count, + existing_files_count: self.existing_files_count, + deleted_files_count: self.deleted_files_count, + added_rows_count: self.added_rows_count, + existing_rows_count: self.existing_rows_count, + deleted_rows_count: self.deleted_rows_count, + partitions: self.partitions, + key_metadata: self.key_metadata, + }) + } +} + +impl Default for ManifestFileBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs new file mode 100644 index 0000000000..ded3e76444 --- /dev/null +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -0,0 +1,19 @@ +pub mod datatypes; +pub mod manifest; +pub mod manifest_list; +pub mod partition; +pub mod schema; +pub mod snapshot; +pub mod table_metadata; +pub mod transform; +pub mod values; + +pub use datatypes::*; +pub use manifest::*; +pub use manifest_list::*; +pub use partition::*; +pub use schema::*; +pub use snapshot::*; +pub use table_metadata::*; +pub use transform::*; +pub use values::*; diff --git a/crates/sail-iceberg/src/spec/partition.rs b/crates/sail-iceberg/src/spec/partition.rs new file mode 100644 index 0000000000..fcf0c8a894 --- /dev/null +++ b/crates/sail-iceberg/src/spec/partition.rs @@ -0,0 +1,211 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use super::datatypes::{NestedField, StructType}; +use super::schema::Schema; +use super::transform::Transform; + +#[allow(unused)] +pub(crate) const UNPARTITIONED_LAST_ASSIGNED_ID: i32 = 999; +pub(crate) const DEFAULT_PARTITION_SPEC_ID: i32 = 0; + +/// Partition fields capture the transform from table data to partition values. +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct PartitionField { + /// A source column id from the table's schema + pub source_id: i32, + /// A partition field id that is used to identify a partition field and is unique within a partition spec. + /// In v2 table metadata, it is unique across all partition specs. + pub field_id: i32, + /// A partition name. + pub name: String, + /// A transform that is applied to the source column to produce a partition value. + pub transform: Transform, +} + +impl PartitionField { + /// Create a new partition field. + pub fn new(source_id: i32, field_id: i32, name: impl ToString, transform: Transform) -> Self { + Self { + source_id, + field_id, + name: name.to_string(), + transform, + } + } +} + +/// Reference to [`PartitionSpec`]. +pub type PartitionSpecRef = Arc; + +/// Partition spec that defines how to produce a tuple of partition values from a record. +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct PartitionSpec { + /// Identifier for PartitionSpec + spec_id: i32, + /// Details of the partition spec + fields: Vec, +} + +impl PartitionSpec { + /// Create a new partition spec builder. 
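// Illustrative usage (not part of this patch): a partition field that buckets a
// hypothetical source column (field id 2) into 16 buckets. Partition field ids
// conventionally start at 1000.
fn example_partition_field() -> PartitionField {
    PartitionField::new(2, 1000, "id_bucket", Transform::Bucket(16))
}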
+ pub fn builder() -> PartitionSpecBuilder { + PartitionSpecBuilder::new() + } + + /// Fields of the partition spec + pub fn fields(&self) -> &[PartitionField] { + &self.fields + } + + /// Spec id of the partition spec + pub fn spec_id(&self) -> i32 { + self.spec_id + } + + /// Get a new unpartitioned partition spec + pub fn unpartitioned_spec() -> Self { + Self { + spec_id: DEFAULT_PARTITION_SPEC_ID, + fields: vec![], + } + } + + /// Returns if the partition spec is unpartitioned. + /// + /// A [`PartitionSpec`] is unpartitioned if it has no fields or all fields are [`Transform::Void`] transform. + pub fn is_unpartitioned(&self) -> bool { + self.fields.is_empty() || self.fields.iter().all(|f| f.transform == Transform::Void) + } + + /// Returns the partition type of this partition spec. + pub fn partition_type(&self, schema: &Schema) -> Result { + let mut partition_fields = Vec::new(); + + for partition_field in self.fields.iter() { + let source_field = schema + .field_by_id(partition_field.source_id) + .ok_or_else(|| { + format!( + "Cannot find source field with id {}", + partition_field.source_id + ) + })?; + + let result_type = partition_field + .transform + .result_type(&source_field.field_type)?; + + let nested_field = NestedField::new( + partition_field.field_id, + &partition_field.name, + result_type, + false, // Partition fields are typically optional + ); + + partition_fields.push(Arc::new(nested_field)); + } + + Ok(StructType::new(partition_fields)) + } + + /// Change the spec id of the partition spec + pub fn with_spec_id(self, spec_id: i32) -> Self { + Self { spec_id, ..self } + } + + /// Get the highest field id in the partition spec. + pub fn highest_field_id(&self) -> Option { + self.fields.iter().map(|f| f.field_id).max() + } + + /// Check if this partition spec is compatible with another partition spec. + /// + /// Returns true if the partition spec is equal to the other spec with partition field ids ignored and + /// spec_id ignored. The following must be identical: + /// * The number of fields + /// * Field order + /// * Field names + /// * Source column ids + /// * Transforms + pub fn is_compatible_with(&self, other: &PartitionSpec) -> bool { + if self.fields.len() != other.fields.len() { + return false; + } + + for (this_field, other_field) in self.fields.iter().zip(other.fields.iter()) { + if this_field.source_id != other_field.source_id + || this_field.name != other_field.name + || this_field.transform != other_field.transform + { + return false; + } + } + + true + } +} + +/// Builder for partition spec. +#[derive(Debug)] +pub struct PartitionSpecBuilder { + spec_id: i32, + fields: Vec, + next_field_id: i32, +} + +impl PartitionSpecBuilder { + /// Create a new partition spec builder. + pub fn new() -> Self { + Self { + spec_id: DEFAULT_PARTITION_SPEC_ID, + fields: Vec::new(), + next_field_id: 1000, // Partition field IDs typically start from 1000 + } + } + + /// Set the spec id. + pub fn with_spec_id(mut self, spec_id: i32) -> Self { + self.spec_id = spec_id; + self + } + + /// Add a partition field. + pub fn add_field(mut self, source_id: i32, name: impl ToString, transform: Transform) -> Self { + let field = PartitionField::new(source_id, self.next_field_id, name, transform); + self.fields.push(field); + self.next_field_id += 1; + self + } + + /// Add a partition field with explicit field id. 
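// Illustrative usage (not part of this patch): a spec that partitions a table by
// day of a hypothetical timestamp column with source field id 3. The `build`
// method used here is defined just below.
fn example_spec() -> PartitionSpec {
    PartitionSpec::builder()
        .with_spec_id(1)
        .add_field(3, "event_day", Transform::Day)
        .build()
}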
+ pub fn add_field_with_id( + mut self, + source_id: i32, + field_id: i32, + name: impl ToString, + transform: Transform, + ) -> Self { + let field = PartitionField::new(source_id, field_id, name, transform); + self.fields.push(field); + self.next_field_id = self.next_field_id.max(field_id + 1); + self + } + + /// Build the partition spec. + pub fn build(self) -> PartitionSpec { + PartitionSpec { + spec_id: self.spec_id, + fields: self.fields, + } + } +} + +impl Default for PartitionSpecBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/sail-iceberg/src/spec/schema.rs b/crates/sail-iceberg/src/spec/schema.rs new file mode 100644 index 0000000000..f3651d7896 --- /dev/null +++ b/crates/sail-iceberg/src/spec/schema.rs @@ -0,0 +1,377 @@ +use std::collections::{HashMap, HashSet}; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use super::datatypes::{NestedFieldRef, PrimitiveType, StructType, Type}; + +/// Type alias for schema id. +pub type SchemaId = i32; +/// Reference to [`Schema`]. +pub type SchemaRef = Arc; +/// Default schema id. +pub const DEFAULT_SCHEMA_ID: SchemaId = 0; + +/// Defines schema in iceberg. +#[derive(Debug, Serialize, Clone)] +pub struct Schema { + #[serde(rename = "type")] + schema_type: String, + #[serde(rename = "schema-id")] + schema_id: SchemaId, + #[serde(rename = "fields")] + fields: Vec, + #[serde( + rename = "identifier-field-ids", + skip_serializing_if = "Option::is_none" + )] + identifier_field_ids: Option>, + + // Internal indexes (not serialized) + #[serde(skip)] + struct_type: StructType, + #[serde(skip)] + highest_field_id: i32, + #[serde(skip)] + id_to_field: HashMap, + #[serde(skip)] + name_to_id: HashMap, + #[serde(skip)] + id_to_name: HashMap, +} + +impl PartialEq for Schema { + fn eq(&self, other: &Self) -> bool { + self.schema_id == other.schema_id + && self.fields == other.fields + && self.identifier_field_ids == other.identifier_field_ids + } +} + +impl Eq for Schema {} + +#[derive(Deserialize)] +struct SchemaData { + #[serde(rename = "type")] + schema_type: String, + #[serde(rename = "schema-id")] + schema_id: SchemaId, + #[serde(rename = "fields")] + fields: Vec, + #[serde(rename = "identifier-field-ids")] + identifier_field_ids: Option>, +} + +impl<'de> Deserialize<'de> for Schema { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let data = SchemaData::deserialize(deserializer)?; + + let struct_type = StructType::new(data.fields.clone()); + let mut id_to_field = HashMap::new(); + SchemaBuilder::index_fields_recursive(struct_type.fields(), &mut id_to_field); + + let mut name_to_id = HashMap::new(); + let mut id_to_name = HashMap::new(); + SchemaBuilder::index_names_recursive( + struct_type.fields(), + "", + &mut name_to_id, + &mut id_to_name, + ); + + let highest_field_id = id_to_field.keys().max().cloned().unwrap_or(0); + + Ok(Schema { + schema_type: data.schema_type, + schema_id: data.schema_id, + fields: data.fields, + identifier_field_ids: data.identifier_field_ids, + struct_type, + highest_field_id, + id_to_field, + name_to_id, + id_to_name, + }) + } +} + +/// Schema builder. +#[derive(Debug)] +pub struct SchemaBuilder { + schema_id: i32, + fields: Vec, + identifier_field_ids: HashSet, +} + +impl SchemaBuilder { + /// Add fields to schema builder. + pub fn with_fields(mut self, fields: impl IntoIterator) -> Self { + self.fields.extend(fields); + self + } + + /// Set schema id. 
+ pub fn with_schema_id(mut self, schema_id: i32) -> Self { + self.schema_id = schema_id; + self + } + + /// Set identifier field ids. + pub fn with_identifier_field_ids(mut self, ids: impl IntoIterator) -> Self { + self.identifier_field_ids.extend(ids); + self + } + + /// Builds the schema. + pub fn build(self) -> Result { + let struct_type = StructType::new(self.fields.clone()); + let id_to_field = self.build_id_to_field_index(&struct_type); + + self.validate_identifier_ids(&id_to_field)?; + + let (name_to_id, id_to_name) = self.build_name_indexes(&struct_type); + let highest_field_id = id_to_field.keys().max().cloned().unwrap_or(0); + + let identifier_field_ids = if self.identifier_field_ids.is_empty() { + None + } else { + Some(self.identifier_field_ids.into_iter().collect()) + }; + + Ok(Schema { + schema_type: "struct".to_string(), + schema_id: self.schema_id, + fields: self.fields, + identifier_field_ids, + struct_type, + highest_field_id, + id_to_field, + name_to_id, + id_to_name, + }) + } + + fn build_id_to_field_index(&self, struct_type: &StructType) -> HashMap { + let mut id_to_field = HashMap::new(); + Self::index_fields_recursive(struct_type.fields(), &mut id_to_field); + id_to_field + } + + fn index_fields_recursive( + fields: &[NestedFieldRef], + id_to_field: &mut HashMap, + ) { + for field in fields { + id_to_field.insert(field.id, field.clone()); + + match field.field_type.as_ref() { + Type::Struct(struct_type) => { + Self::index_fields_recursive(struct_type.fields(), id_to_field); + } + Type::List(list_type) => { + id_to_field.insert(list_type.element_field.id, list_type.element_field.clone()); + if let Type::Struct(struct_type) = list_type.element_field.field_type.as_ref() { + Self::index_fields_recursive(struct_type.fields(), id_to_field); + } + } + Type::Map(map_type) => { + id_to_field.insert(map_type.key_field.id, map_type.key_field.clone()); + id_to_field.insert(map_type.value_field.id, map_type.value_field.clone()); + if let Type::Struct(struct_type) = map_type.key_field.field_type.as_ref() { + Self::index_fields_recursive(struct_type.fields(), id_to_field); + } + if let Type::Struct(struct_type) = map_type.value_field.field_type.as_ref() { + Self::index_fields_recursive(struct_type.fields(), id_to_field); + } + } + _ => {} + } + } + } + + fn build_name_indexes( + &self, + struct_type: &StructType, + ) -> (HashMap, HashMap) { + let mut name_to_id = HashMap::new(); + let mut id_to_name = HashMap::new(); + Self::index_names_recursive(struct_type.fields(), "", &mut name_to_id, &mut id_to_name); + (name_to_id, id_to_name) + } + + fn index_names_recursive( + fields: &[NestedFieldRef], + prefix: &str, + name_to_id: &mut HashMap, + id_to_name: &mut HashMap, + ) { + for field in fields { + let full_name = if prefix.is_empty() { + field.name.clone() + } else { + format!("{}.{}", prefix, field.name) + }; + + name_to_id.insert(full_name.clone(), field.id); + id_to_name.insert(field.id, full_name.clone()); + + match field.field_type.as_ref() { + Type::Struct(struct_type) => { + Self::index_names_recursive( + struct_type.fields(), + &full_name, + name_to_id, + id_to_name, + ); + } + Type::List(list_type) => { + let element_name = format!("{}.element", full_name); + name_to_id.insert(element_name.clone(), list_type.element_field.id); + id_to_name.insert(list_type.element_field.id, element_name); + + if let Type::Struct(struct_type) = list_type.element_field.field_type.as_ref() { + Self::index_names_recursive( + struct_type.fields(), + &full_name, + name_to_id, + id_to_name, + ); 
+ } + } + Type::Map(map_type) => { + let key_name = format!("{}.key", full_name); + let value_name = format!("{}.value", full_name); + + name_to_id.insert(key_name.clone(), map_type.key_field.id); + id_to_name.insert(map_type.key_field.id, key_name); + + name_to_id.insert(value_name.clone(), map_type.value_field.id); + id_to_name.insert(map_type.value_field.id, value_name.clone()); + + if let Type::Struct(struct_type) = map_type.value_field.field_type.as_ref() { + Self::index_names_recursive( + struct_type.fields(), + &value_name, + name_to_id, + id_to_name, + ); + } + } + _ => {} + } + } + } + + fn validate_identifier_ids( + &self, + id_to_field: &HashMap, + ) -> Result<(), String> { + for identifier_field_id in &self.identifier_field_ids { + let field = id_to_field.get(identifier_field_id).ok_or_else(|| { + format!("Cannot add identifier field {identifier_field_id}: field does not exist") + })?; + + if !field.required { + return Err(format!( + "Cannot add identifier field: {} is an optional field", + field.name + )); + } + + if let Type::Primitive(p) = field.field_type.as_ref() { + if matches!(p, PrimitiveType::Double | PrimitiveType::Float) { + return Err(format!( + "Cannot add identifier field {}: cannot be a float or double type", + field.name + )); + } + } else { + return Err(format!( + "Cannot add field {} as an identifier field: not a primitive type field", + field.name + )); + } + } + + Ok(()) + } +} + +impl Schema { + /// Create a schema builder. + pub fn builder() -> SchemaBuilder { + SchemaBuilder { + schema_id: DEFAULT_SCHEMA_ID, + fields: vec![], + identifier_field_ids: HashSet::default(), + } + } + + /// Get field by field id. + pub fn field_by_id(&self, field_id: i32) -> Option<&NestedFieldRef> { + self.id_to_field.get(&field_id) + } + + /// Get field by field name. + pub fn field_by_name(&self, field_name: &str) -> Option<&NestedFieldRef> { + self.name_to_id + .get(field_name) + .and_then(|id| self.field_by_id(*id)) + } + + /// Returns [`highest_field_id`]. + #[inline] + pub fn highest_field_id(&self) -> i32 { + self.highest_field_id + } + + /// Returns [`schema_id`]. + #[inline] + pub fn schema_id(&self) -> SchemaId { + self.schema_id + } + + /// Returns the struct type representation of this schema. + pub fn as_struct(&self) -> &StructType { + &self.struct_type + } + + /// Returns [`identifier_field_ids`]. + pub fn identifier_field_ids(&self) -> impl ExactSizeIterator + '_ { + self.identifier_field_ids + .as_ref() + .map(|ids| ids.iter().copied()) + .unwrap_or_else(|| [].iter().copied()) + .collect::>() + .into_iter() + } + + /// Get field id by full name. + pub fn field_id_by_name(&self, name: &str) -> Option { + self.name_to_id.get(name).copied() + } + + /// Get full name by field id. + pub fn name_by_field_id(&self, field_id: i32) -> Option<&str> { + self.id_to_name.get(&field_id).map(String::as_str) + } + + /// Get all fields in the schema. 
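// Illustrative usage (not part of this patch): building a two-column schema and
// resolving a field id by name. Assumes `NestedField` is in scope; the ids and
// names are hypothetical, and the builder's error type is elided via `ok()`.
fn example_schema() -> Option<Schema> {
    let schema = Schema::builder()
        .with_schema_id(0)
        .with_fields(vec![
            Arc::new(NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))),
            Arc::new(NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String))),
        ])
        .with_identifier_field_ids([1])
        .build()
        .ok()?;
    // Nested lookups are served by the name index built during `build`.
    assert_eq!(schema.field_id_by_name("name"), Some(2));
    Some(schema)
}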
+ pub fn fields(&self) -> &[NestedFieldRef] { + &self.fields + } +} + +impl Display for Schema { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + writeln!(f, "table {{")?; + for field in &self.fields { + writeln!(f, " {}", field)?; + } + writeln!(f, "}}") + } +} diff --git a/crates/sail-iceberg/src/spec/snapshot.rs b/crates/sail-iceberg/src/spec/snapshot.rs new file mode 100644 index 0000000000..237cf1a21f --- /dev/null +++ b/crates/sail-iceberg/src/spec/snapshot.rs @@ -0,0 +1,249 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; + +use super::schema::SchemaId; + +/// The ref name of the main branch of the table. +pub const MAIN_BRANCH: &str = "main"; +pub const UNASSIGNED_SNAPSHOT_ID: i64 = -1; + +/// Reference to [`Snapshot`]. +pub type SnapshotRef = Arc; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "lowercase")] +/// The operation field is used by some operations, like snapshot expiration, to skip processing certain snapshots. +pub enum Operation { + /// Only data files were added and no files were removed. + Append, + /// Data and delete files were added and removed without changing table data; + /// i.e., compaction, changing the data file format, or relocating data files. + Replace, + /// Data and delete files were added and removed in a logical overwrite operation. + Overwrite, + /// Data files were removed and their contents logically deleted and/or delete files were added to delete rows. + Delete, +} + +impl Operation { + /// Returns the string representation (lowercase) of the operation. + pub fn as_str(&self) -> &str { + match self { + Operation::Append => "append", + Operation::Replace => "replace", + Operation::Overwrite => "overwrite", + Operation::Delete => "delete", + } + } +} + +impl Default for Operation { + fn default() -> Operation { + Self::Append + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +/// Summarises the changes in the snapshot. +pub struct Summary { + /// The type of operation in the snapshot + pub operation: Operation, + /// Other summary data. + #[serde(flatten)] + pub additional_properties: HashMap, +} + +impl Summary { + /// Create a new summary with the given operation. + pub fn new(operation: Operation) -> Self { + Self { + operation, + additional_properties: HashMap::new(), + } + } + + /// Add additional property to the summary. + pub fn with_property(mut self, key: impl ToString, value: impl ToString) -> Self { + self.additional_properties + .insert(key.to_string(), value.to_string()); + self + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +/// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table. +pub struct Snapshot { + /// A unique long ID + pub snapshot_id: i64, + /// The snapshot ID of the snapshot's parent. + /// Omitted for any snapshot with no parent + #[serde(skip_serializing_if = "Option::is_none")] + pub parent_snapshot_id: Option, + /// A monotonically increasing long that tracks the order of + /// changes to a table. + pub sequence_number: i64, + /// A timestamp when the snapshot was created, used for garbage + /// collection and table inspection + pub timestamp_ms: i64, + /// The location of a manifest list for this snapshot that + /// tracks manifest files with additional metadata. 
+ pub manifest_list: String, + /// A string map that summarizes the snapshot changes, including operation. + pub summary: Summary, + /// ID of the table's current schema when the snapshot was created. + #[serde(skip_serializing_if = "Option::is_none")] + pub schema_id: Option, +} + +impl Snapshot { + /// Create a new snapshot builder. + pub fn builder() -> SnapshotBuilder { + SnapshotBuilder::new() + } + + /// Get the id of the snapshot + #[inline] + pub fn snapshot_id(&self) -> i64 { + self.snapshot_id + } + + /// Get parent snapshot id. + #[inline] + pub fn parent_snapshot_id(&self) -> Option { + self.parent_snapshot_id + } + + /// Get sequence_number of the snapshot. Is 0 for Iceberg V1 tables. + #[inline] + pub fn sequence_number(&self) -> i64 { + self.sequence_number + } + + /// Get location of manifest_list file + #[inline] + pub fn manifest_list(&self) -> &str { + &self.manifest_list + } + + /// Get summary of the snapshot + #[inline] + pub fn summary(&self) -> &Summary { + &self.summary + } + + /// Get the timestamp of when the snapshot was created + #[inline] + pub fn timestamp(&self) -> Result, String> { + DateTime::from_timestamp_millis(self.timestamp_ms) + .ok_or_else(|| format!("Invalid timestamp: {}", self.timestamp_ms)) + } + + /// Get the timestamp of when the snapshot was created in milliseconds + #[inline] + pub fn timestamp_ms(&self) -> i64 { + self.timestamp_ms + } + + /// Get the schema id of this snapshot. + #[inline] + pub fn schema_id(&self) -> Option { + self.schema_id + } +} + +/// Builder for creating snapshots. +#[derive(Debug)] +pub struct SnapshotBuilder { + snapshot_id: i64, + parent_snapshot_id: Option, + sequence_number: i64, + timestamp_ms: i64, + manifest_list: Option, + summary: Option, + schema_id: Option, +} + +impl SnapshotBuilder { + /// Create a new snapshot builder. + pub fn new() -> Self { + Self { + snapshot_id: UNASSIGNED_SNAPSHOT_ID, + parent_snapshot_id: None, + sequence_number: 0, + timestamp_ms: chrono::Utc::now().timestamp_millis(), + manifest_list: None, + summary: None, + schema_id: None, + } + } + + /// Set the snapshot id. + pub fn with_snapshot_id(mut self, snapshot_id: i64) -> Self { + self.snapshot_id = snapshot_id; + self + } + + /// Set the parent snapshot id. + pub fn with_parent_snapshot_id(mut self, parent_snapshot_id: i64) -> Self { + self.parent_snapshot_id = Some(parent_snapshot_id); + self + } + + /// Set the sequence number. + pub fn with_sequence_number(mut self, sequence_number: i64) -> Self { + self.sequence_number = sequence_number; + self + } + + /// Set the timestamp in milliseconds. + pub fn with_timestamp_ms(mut self, timestamp_ms: i64) -> Self { + self.timestamp_ms = timestamp_ms; + self + } + + /// Set the manifest list location. + pub fn with_manifest_list(mut self, manifest_list: impl ToString) -> Self { + self.manifest_list = Some(manifest_list.to_string()); + self + } + + /// Set the summary. + pub fn with_summary(mut self, summary: Summary) -> Self { + self.summary = Some(summary); + self + } + + /// Set the schema id. + pub fn with_schema_id(mut self, schema_id: SchemaId) -> Self { + self.schema_id = Some(schema_id); + self + } + + /// Build the snapshot. 
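// Illustrative usage (not part of this patch): recording an append snapshot. The
// ids, timestamp, and manifest-list path are made up; `build` is defined just
// below, and its error type is elided via `ok()`.
fn example_snapshot() -> Option<Snapshot> {
    Snapshot::builder()
        .with_snapshot_id(1)
        .with_sequence_number(1)
        .with_timestamp_ms(1_700_000_000_000)
        .with_manifest_list("s3://bucket/db/tbl/metadata/snap-1.avro")
        .with_summary(Summary::new(Operation::Append).with_property("added-data-files", "1"))
        .build()
        .ok()
}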
+ pub fn build(self) -> Result { + let manifest_list = self.manifest_list.ok_or("manifest_list is required")?; + let summary = self + .summary + .unwrap_or_else(|| Summary::new(Operation::Append)); + + Ok(Snapshot { + snapshot_id: self.snapshot_id, + parent_snapshot_id: self.parent_snapshot_id, + sequence_number: self.sequence_number, + timestamp_ms: self.timestamp_ms, + manifest_list, + summary, + schema_id: self.schema_id, + }) + } +} + +impl Default for SnapshotBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/sail-iceberg/src/spec/table_metadata.rs b/crates/sail-iceberg/src/spec/table_metadata.rs new file mode 100644 index 0000000000..e1ebac60d9 --- /dev/null +++ b/crates/sail-iceberg/src/spec/table_metadata.rs @@ -0,0 +1,161 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::Value as JsonValue; +use uuid::Uuid; + +use super::{FormatVersion, PartitionSpec, Schema, Snapshot}; + +/// Iceberg table metadata +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct TableMetadata { + /// Integer Version for the format + pub format_version: FormatVersion, + /// A UUID that identifies the table + pub table_uuid: Option, + /// Location tables base location + pub location: String, + /// The tables highest sequence number + #[serde(default)] + pub last_sequence_number: i64, + /// Timestamp in milliseconds from the unix epoch when the table was last updated + pub last_updated_ms: i64, + /// An integer; the highest assigned column ID for the table + pub last_column_id: i32, + /// A list of schemas, stored as objects with schema-id + pub schemas: Vec, + /// ID of the table's current schema + pub current_schema_id: i32, + /// A list of partition specs, stored as full partition spec objects + #[serde(default)] + pub partition_specs: Vec, + /// ID of the "current" spec that writers should use by default + #[serde(default)] + pub default_spec_id: i32, + /// An integer; the highest assigned partition field ID across all partition specs for the table + #[serde(default)] + pub last_partition_id: i32, + /// A string to string map of table properties + #[serde(default)] + pub properties: HashMap, + /// long ID of the current table snapshot + pub current_snapshot_id: Option, + /// A list of valid snapshots + #[serde(default)] + pub snapshots: Vec, + /// A list of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table + #[serde(default)] + pub snapshot_log: Vec, + /// A list of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table + #[serde(default)] + pub metadata_log: Vec, + /// Sort orders for the table + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub sort_orders: Vec, + /// Default sort order ID + #[serde(default, skip_serializing_if = "Option::is_none")] + pub default_sort_order_id: Option, + /// Named references to snapshots + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + pub refs: HashMap, + /// Statistics files + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub statistics: Vec, + /// Partition statistics files + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub partition_statistics: Vec, +} + +/// Snapshot log entry +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct SnapshotLog { + /// Timestamp when the snapshot became current + pub timestamp_ms: i64, + /// Snapshot ID + pub snapshot_id: i64, +} + 
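// Illustrative usage (not part of this patch): finding the snapshot that was
// current at a given time by scanning the snapshot log defined above.
fn snapshot_id_as_of(meta: &TableMetadata, ts_ms: i64) -> Option<i64> {
    meta.snapshot_log
        .iter()
        .filter(|entry| entry.timestamp_ms <= ts_ms)
        .max_by_key(|entry| entry.timestamp_ms)
        .map(|entry| entry.snapshot_id)
}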
+/// Metadata log entry +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct MetadataLog { + /// Timestamp when the metadata file was created + pub timestamp_ms: i64, + /// Location of the metadata file + pub metadata_file: String, +} + +impl TableMetadata { + /// Get the current schema + pub fn current_schema(&self) -> Option<&Schema> { + self.schemas + .iter() + .find(|schema| schema.schema_id() == self.current_schema_id) + } + + /// Get the current snapshot + pub fn current_snapshot(&self) -> Option<&Snapshot> { + if let Some(snapshot_id) = self.current_snapshot_id { + self.snapshots + .iter() + .find(|snapshot| snapshot.snapshot_id() == snapshot_id) + } else { + None + } + } + + /// Get the default partition spec + pub fn default_partition_spec(&self) -> Option<&PartitionSpec> { + self.partition_specs + .iter() + .find(|spec| spec.spec_id() == self.default_spec_id) + } + + pub fn from_json(data: &[u8]) -> Result { + log::debug!("[ICEBERG] Attempting to parse table metadata JSON"); + + match serde_json::from_slice::(data) { + Ok(json_value) => { + if let Some(obj) = json_value.as_object() { + log::debug!( + "[ICEBERG] JSON fields present: {:?}", + obj.keys().collect::>() + ); + + if let Some(refs) = obj.get("refs") { + log::debug!("[ICEBERG] refs field: {:?}", refs); + } + if let Some(sort_orders) = obj.get("sort-orders") { + log::debug!("[ICEBERG] sort-orders field: {:?}", sort_orders); + } + if let Some(stats) = obj.get("statistics") { + log::debug!("[ICEBERG] statistics field: {:?}", stats); + } + if let Some(partition_stats) = obj.get("partition-statistics") { + log::debug!( + "[ICEBERG] partition-statistics field: {:?}", + partition_stats + ); + } + } + + log::debug!("[ICEBERG] Deserializing to TableMetadata struct"); + serde_json::from_value::(json_value).map_err(|e| { + log::error!("[ICEBERG] Failed to deserialize TableMetadata: {:?}", e); + e + }) + } + Err(e) => { + log::error!("[ICEBERG] Failed to parse as JSON: {:?}", e); + Err(e) + } + } + } + + /// Serialize table metadata to JSON bytes + pub fn to_json(&self) -> Result, serde_json::Error> { + serde_json::to_vec(self) + } +} diff --git a/crates/sail-iceberg/src/spec/transform.rs b/crates/sail-iceberg/src/spec/transform.rs new file mode 100644 index 0000000000..0653f6a0ef --- /dev/null +++ b/crates/sail-iceberg/src/spec/transform.rs @@ -0,0 +1,212 @@ +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use super::datatypes::{PrimitiveType, Type}; +use super::values::Literal; + +/// Transform is used to transform predicates to partition predicates, +/// in addition to transforming data values. +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Transform { + /// Source value, unmodified + Identity, + /// Hash of value, mod `N`. + Bucket(u32), + /// Value truncated to width `W` + Truncate(u32), + /// Extract a date or timestamp year, as years from 1970 + Year, + /// Extract a date or timestamp month, as months from 1970-01-01 + Month, + /// Extract a date or timestamp day, as days from 1970-01-01 + Day, + /// Extract a timestamp hour, as hours from 1970-01-01 00:00:00 + Hour, + /// Always produces `null` + Void, + /// Used to represent some customized transform that can't be recognized or supported now. 
+ Unknown, +} + +impl Transform { + // TODO: Full value transformation support + pub fn to_human_string(self, _field_type: &Type, value: Option<&Literal>) -> String { + if let Some(_value) = value { + match self { + Self::Identity => "identity_value".to_string(), + Self::Void => "null".to_string(), + _ => "transformed_value".to_string(), + } + } else { + "null".to_string() + } + } + + /// Get the return type of transform given the input type. + /// Returns `None` if it can't be transformed. + pub fn result_type(&self, input_type: &Type) -> Result { + match self { + Transform::Identity => { + if matches!(input_type, Type::Primitive(_)) { + Ok(input_type.clone()) + } else { + Err(format!( + "{input_type} is not a valid input type of identity transform" + )) + } + } + Transform::Void => Ok(input_type.clone()), + Transform::Unknown => Ok(Type::Primitive(PrimitiveType::String)), + Transform::Bucket(_) => { + if let Type::Primitive(p) = input_type { + match p { + PrimitiveType::Int + | PrimitiveType::Long + | PrimitiveType::Decimal { .. } + | PrimitiveType::Date + | PrimitiveType::Time + | PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs + | PrimitiveType::String + | PrimitiveType::Uuid + | PrimitiveType::Fixed(_) + | PrimitiveType::Binary => Ok(Type::Primitive(PrimitiveType::Int)), + _ => Err(format!( + "{input_type} is not a valid input type of bucket transform" + )), + } + } else { + Err(format!( + "{input_type} is not a valid input type of bucket transform" + )) + } + } + Transform::Truncate(_) => { + if let Type::Primitive(p) = input_type { + match p { + PrimitiveType::Int + | PrimitiveType::Long + | PrimitiveType::Decimal { .. } + | PrimitiveType::String => Ok(input_type.clone()), + _ => Err(format!( + "{input_type} is not a valid input type of truncate transform" + )), + } + } else { + Err(format!( + "{input_type} is not a valid input type of truncate transform" + )) + } + } + Transform::Year | Transform::Month | Transform::Day => { + if let Type::Primitive(p) = input_type { + match p { + PrimitiveType::Date + | PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs => Ok(Type::Primitive(PrimitiveType::Int)), + _ => Err(format!( + "{input_type} is not a valid input type of date transform" + )), + } + } else { + Err(format!( + "{input_type} is not a valid input type of date transform" + )) + } + } + Transform::Hour => { + if let Type::Primitive(p) = input_type { + match p { + PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs => Ok(Type::Primitive(PrimitiveType::Int)), + _ => Err(format!( + "{input_type} is not a valid input type of hour transform" + )), + } + } else { + Err(format!( + "{input_type} is not a valid input type of hour transform" + )) + } + } + } + } +} + +impl Display for Transform { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Transform::Identity => write!(f, "identity"), + Transform::Bucket(n) => write!(f, "bucket[{}]", n), + Transform::Truncate(w) => write!(f, "truncate[{}]", w), + Transform::Year => write!(f, "year"), + Transform::Month => write!(f, "month"), + Transform::Day => write!(f, "day"), + Transform::Hour => write!(f, "hour"), + Transform::Void => write!(f, "void"), + Transform::Unknown => write!(f, "unknown"), + } + } +} + +impl FromStr for Transform { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + 
"identity" => Ok(Transform::Identity), + "year" => Ok(Transform::Year), + "month" => Ok(Transform::Month), + "day" => Ok(Transform::Day), + "hour" => Ok(Transform::Hour), + "void" => Ok(Transform::Void), + _ => { + if let Some(bucket_str) = + s.strip_prefix("bucket[").and_then(|s| s.strip_suffix(']')) + { + let n: u32 = bucket_str + .parse() + .map_err(|_| format!("Invalid bucket parameter: {}", bucket_str))?; + Ok(Transform::Bucket(n)) + } else if let Some(truncate_str) = s + .strip_prefix("truncate[") + .and_then(|s| s.strip_suffix(']')) + { + let w: u32 = truncate_str + .parse() + .map_err(|_| format!("Invalid truncate parameter: {}", truncate_str))?; + Ok(Transform::Truncate(w)) + } else { + Ok(Transform::Unknown) + } + } + } + } +} + +impl Serialize for Transform { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for Transform { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Transform::from_str(&s).map_err(serde::de::Error::custom) + } +} diff --git a/crates/sail-iceberg/src/spec/values.rs b/crates/sail-iceberg/src/spec/values.rs new file mode 100644 index 0000000000..140cc35ae1 --- /dev/null +++ b/crates/sail-iceberg/src/spec/values.rs @@ -0,0 +1,57 @@ +use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; +use serde_json::Value as JsonValue; + +/// Literal values used in Iceberg +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Literal { + Primitive(PrimitiveLiteral), + Struct(Vec<(String, Option)>), + List(Vec>), + Map(Vec<(Literal, Option)>), +} + +/// Primitive literal values +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(untagged)] +pub enum PrimitiveLiteral { + Boolean(bool), + Int(i32), + Long(i64), + Float(OrderedFloat), + Double(OrderedFloat), + Int128(i128), + String(String), + UInt128(u128), + Binary(Vec), +} + +impl Literal { + // TODO: Type-aware JSON conversion + pub fn try_from_json( + value: JsonValue, + _data_type: &crate::spec::Type, + ) -> Result, String> { + match value { + JsonValue::Null => Ok(None), + _ => Ok(Some(Literal::Primitive(PrimitiveLiteral::String( + value.to_string(), + )))), + } + } + + // TODO: Type-aware JSON conversion + pub fn try_into_json(&self, _data_type: &crate::spec::Type) -> Result { + match self { + Literal::Primitive(p) => match p { + PrimitiveLiteral::Boolean(v) => Ok(JsonValue::Bool(*v)), + PrimitiveLiteral::Int(v) => Ok(JsonValue::Number((*v).into())), + PrimitiveLiteral::Long(v) => Ok(JsonValue::Number((*v).into())), + PrimitiveLiteral::String(v) => Ok(JsonValue::String(v.clone())), + _ => Ok(JsonValue::String(format!("{:?}", p))), + }, + _ => Ok(JsonValue::String(format!("{:?}", self))), + } + } +} diff --git a/crates/sail-iceberg/src/table_format.rs b/crates/sail-iceberg/src/table_format.rs new file mode 100644 index 0000000000..5a2e801766 --- /dev/null +++ b/crates/sail-iceberg/src/table_format.rs @@ -0,0 +1,206 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::{Session, TableProvider}; +use datafusion::common::{not_impl_err, plan_err, DataFusionError, Result}; +use datafusion::physical_plan::ExecutionPlan; +use sail_common_datafusion::datasource::{SinkInfo, SourceInfo, TableFormat}; +use url::Url; + +use crate::datasource::provider::IcebergTableProvider; +use crate::spec::{Schema, Snapshot, TableMetadata}; + 
+#[derive(Debug)] +pub struct IcebergTableFormat; + +#[async_trait] +impl TableFormat for IcebergTableFormat { + fn name(&self) -> &str { + "iceberg" + } + + async fn create_provider( + &self, + ctx: &dyn Session, + info: SourceInfo, + ) -> Result> { + let SourceInfo { + paths, + schema: _schema, + constraints: _, + partition_by: _, + bucket_by: _, + sort_order: _, + options: _options, + } = info; + + log::info!("[ICEBERG] Creating table provider for paths: {:?}", paths); + let table_url = Self::parse_table_url(ctx, paths).await?; + log::info!("[ICEBERG] Parsed table URL: {}", table_url); + + let (iceberg_schema, snapshot) = load_table_metadata(ctx, &table_url).await?; + log::info!( + "[ICEBERG] Loaded metadata, snapshot_id: {}", + snapshot.snapshot_id() + ); + + let provider = IcebergTableProvider::new(table_url.to_string(), iceberg_schema, snapshot)?; + Ok(Arc::new(provider)) + } + + async fn create_writer( + &self, + _ctx: &dyn Session, + _info: SinkInfo, + ) -> Result> { + not_impl_err!("Writing to Iceberg tables is not yet implemented") + } +} + +impl IcebergTableFormat { + async fn parse_table_url(ctx: &dyn Session, paths: Vec) -> Result { + if paths.len() != 1 { + return plan_err!( + "Iceberg table requires exactly one path, got {}", + paths.len() + ); + } + + let path = &paths[0]; + let mut table_url = Url::parse(path).map_err(|e| DataFusionError::External(Box::new(e)))?; + + if !table_url.path().ends_with('/') { + table_url.set_path(&format!("{}/", table_url.path())); + } + + // Validate that we can access the object store + let _object_store = ctx + .runtime_env() + .object_store_registry + .get_store(&table_url) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(table_url) + } +} + +/// Load Iceberg table metadata from the table location +async fn load_table_metadata(ctx: &dyn Session, table_url: &Url) -> Result<(Schema, Snapshot)> { + log::debug!("[ICEBERG] Loading table metadata from: {}", table_url); + let object_store = ctx + .runtime_env() + .object_store_registry + .get_store(table_url) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + let metadata_location = find_latest_metadata_file(&object_store, table_url).await?; + log::info!("[ICEBERG] Found metadata file: {}", metadata_location); + + let metadata_path = object_store::path::Path::from(metadata_location.as_str()); + let metadata_data = object_store + .get(&metadata_path) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))? + .bytes() + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + log::debug!( + "[ICEBERG] Read {} bytes from metadata file", + metadata_data.len() + ); + + let table_metadata = TableMetadata::from_json(&metadata_data).map_err(|e| { + log::error!("[ICEBERG] Failed to parse table metadata: {:?}", e); + DataFusionError::External(Box::new(e)) + })?; + + // Get the current schema + let schema = table_metadata + .current_schema() + .ok_or_else(|| { + DataFusionError::Plan("No current schema found in table metadata".to_string()) + })? + .clone(); + + // Get the current snapshot + let snapshot = table_metadata + .current_snapshot() + .ok_or_else(|| { + DataFusionError::Plan("No current snapshot found in table metadata".to_string()) + })? 
+ .clone(); + + Ok((schema, snapshot)) +} + +/// Find the latest metadata file in the table location +async fn find_latest_metadata_file( + object_store: &Arc, + table_url: &Url, +) -> Result { + use futures::TryStreamExt; + use object_store::path::Path as ObjectPath; + + log::debug!("[ICEBERG] Finding latest metadata file"); + let version_hint_path = + ObjectPath::from(format!("{}metadata/version-hint.text", table_url.path()).as_str()); + + if let Ok(version_hint_data) = object_store.get(&version_hint_path).await { + if let Ok(version_hint_bytes) = version_hint_data.bytes().await { + if let Ok(version_hint) = String::from_utf8(version_hint_bytes.to_vec()) { + let version = version_hint.trim().parse::().unwrap_or(0); + let metadata_file = + format!("{}/metadata/v{}.metadata.json", table_url.path(), version); + log::debug!("[ICEBERG] Using version hint: {}", version); + return Ok(metadata_file); + } + } + } + + log::debug!("[ICEBERG] No version hint, listing metadata directory"); + let metadata_prefix = ObjectPath::from(format!("{}metadata/", table_url.path()).as_str()); + let objects = object_store.list(Some(&metadata_prefix)); + + let metadata_files: Result, _> = objects + .try_filter_map(|obj| async move { + let path_str = obj.location.to_string(); + if path_str.ends_with(".metadata.json") { + if let Some(filename) = path_str.split('/').next_back() { + // Try new format first: 00001-uuid.metadata.json + if let Some(version_part) = filename.split('-').next() { + if let Ok(version) = version_part.parse::() { + return Ok(Some((version, path_str, obj.last_modified))); + } + } + // Try old format: v123.metadata.json + if let Some(version_str) = filename + .strip_prefix('v') + .and_then(|s| s.strip_suffix(".metadata.json")) + { + if let Ok(version) = version_str.parse::() { + return Ok(Some((version, path_str, obj.last_modified))); + } + } + } + } + Ok(None) + }) + .try_collect() + .await; + + match metadata_files { + Ok(mut files) => { + files.sort_by_key(|(version, _, _)| *version); + + if let Some((_, latest_file, _)) = files.last() { + Ok(latest_file.clone()) + } else { + plan_err!("No metadata files found in table location: {}", table_url) + } + } + Err(e) => { + plan_err!("Failed to list metadata directory: {}", e) + } + } +} diff --git a/pyproject.toml b/pyproject.toml index 3bdc81b967..018cecb96b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ test = [ "duckdb>=1.0,<2", "pytest>=8.4,<9", "pillow>=10.3.0", + "pyiceberg[sql-sqlite]>=0.8,<1", ] mcp = [ "mcp>=1.0.0,<2", @@ -76,6 +77,7 @@ dependencies = [ "mcp>=1.0,<2", "boto3>=1.38,<2", "pillow>=10.3.0", + "pyiceberg[sql-sqlite]>=0.8,<1", ] path = ".venvs/default" @@ -114,6 +116,7 @@ dependencies = [ "pytest>=8.4,<9", "duckdb>=1.1,<2", "pillow>=10.3.0", + "pyiceberg[sql-sqlite]>=0.8,<1", ] [[tool.hatch.envs.test.matrix]] @@ -140,6 +143,7 @@ dependencies = [ "pytest-xdist>=3.7,<4", "pytest-timeout>=2.4,<3", "pytest-reportlog>=0.4,<0.5", + "pyiceberg[sql-sqlite]>=0.8,<1", ] [[tool.hatch.envs.test-spark.matrix]] diff --git a/python/pysail/tests/spark/iceberg/__init__.py b/python/pysail/tests/spark/iceberg/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_io.py b/python/pysail/tests/spark/iceberg/test_iceberg_io.py new file mode 100644 index 0000000000..bcb7e40aaf --- /dev/null +++ b/python/pysail/tests/spark/iceberg/test_iceberg_io.py @@ -0,0 +1,172 @@ +import pandas as pd +import pyarrow as pa +import pytest +from pandas.testing import 
assert_frame_equal +from pyiceberg.catalog import load_catalog +from pyiceberg.schema import Schema +from pyiceberg.types import DoubleType, LongType, NestedField, StringType +from pyspark.sql.types import Row + +from ..utils import get_data_files + + +class TestIcebergIO: + @pytest.fixture(scope="class") + def iceberg_test_data(self): + return [ + {"id": 10, "event": "A", "score": 0.98}, + {"id": 11, "event": "B", "score": 0.54}, + {"id": 12, "event": "A", "score": 0.76}, + ] + + @pytest.fixture(scope="class") + def expected_pandas_df(self): + return pd.DataFrame({"id": [10, 11, 12], "event": ["A", "B", "A"], "score": [0.98, 0.54, 0.76]}).astype( + {"id": "int64", "event": "string", "score": "float64"} + ) + + def test_iceberg_io_basic_read(self, spark, iceberg_test_data, expected_pandas_df, tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + table_name = "test_table" + + catalog = load_catalog( + "test_catalog", + **{ + "type": "sql", + "uri": f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + "warehouse": f"file://{warehouse_path}", + }, + ) + + catalog.create_namespace("default") + + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="event", field_type=StringType(), required=False), + NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), + ) + + table = catalog.create_table( + identifier=f"default.{table_name}", + schema=schema, + ) + + try: + df = pd.DataFrame(iceberg_test_data) + arrow_table = pa.Table.from_pandas(df) + table.append(arrow_table) + + table_location = table.metadata_location + # TODO: Keep file:// prefix for Sail, just remove /metadata/... suffix + table_path = table_location.rsplit("/metadata/", 1)[0] + + result_df = spark.read.format("iceberg").load(table_path).sort("id") + + assert_frame_equal( + result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False + ) + finally: + catalog.drop_table(f"default.{table_name}") + + def test_iceberg_io_read_with_sql(self, spark, iceberg_test_data, expected_pandas_df, tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + table_name = "test_table_sql" + + catalog = load_catalog( + "test_catalog", + **{ + "type": "sql", + "uri": f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + "warehouse": f"file://{warehouse_path}", + }, + ) + + catalog.create_namespace("default") + + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="event", field_type=StringType(), required=False), + NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), + ) + + table = catalog.create_table( + identifier=f"default.{table_name}", + schema=schema, + ) + + try: + df = pd.DataFrame(iceberg_test_data) + arrow_table = pa.Table.from_pandas(df) + table.append(arrow_table) + + table_location = table.metadata_location + # TODO: Keep file:// prefix for Sail, just remove /metadata/... 
suffix + table_path = table_location.rsplit("/metadata/", 1)[0] + + spark.sql(f"CREATE TABLE my_iceberg USING iceberg LOCATION '{table_path}'") + + try: + result_df = spark.sql("SELECT * FROM my_iceberg").sort("id") + + assert_frame_equal( + result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False + ) + finally: + spark.sql("DROP TABLE IF EXISTS my_iceberg") + finally: + catalog.drop_table(f"default.{table_name}") + + def test_iceberg_io_multiple_files(self, spark, tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + table_name = "test_table_multiple" + + catalog = load_catalog( + "test_catalog", + **{ + "type": "sql", + "uri": f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + "warehouse": f"file://{warehouse_path}", + }, + ) + + catalog.create_namespace("default") + + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="value", field_type=StringType(), required=False), + ) + + table = catalog.create_table( + identifier=f"default.{table_name}", + schema=schema, + ) + + try: + df1 = pd.DataFrame([{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]) + arrow_table1 = pa.Table.from_pandas(df1) + table.append(arrow_table1) + + df2 = pd.DataFrame([{"id": 3, "value": "c"}, {"id": 4, "value": "d"}]) + arrow_table2 = pa.Table.from_pandas(df2) + table.append(arrow_table2) + + table_location = table.metadata_location + # TODO: Keep file:// prefix for Sail, just remove /metadata/... suffix + table_path = table_location.rsplit("/metadata/", 1)[0] + + result_df = spark.read.format("iceberg").load(table_path).sort("id") + + expected_data = pd.DataFrame({"id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]}).astype( + {"id": "int64", "value": "string"} + ) + + assert_frame_equal( + result_df.toPandas(), expected_data.sort_values("id").reset_index(drop=True), check_dtype=False + ) + + assert result_df.count() == 4 + finally: + catalog.drop_table(f"default.{table_name}") From d3f4eedbf5754e88374f01c42db90c9f03e8dd21 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 30 Sep 2025 17:03:10 +0800 Subject: [PATCH 02/32] update --- Cargo.lock | 2 +- .../tests/spark/iceberg/test_iceberg_io.py | 29 +++++++------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 134c48f992..58a9f13fa9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5707,7 +5707,7 @@ dependencies = [ [[package]] name = "sail-iceberg" -version = "0.3.5" +version = "0.3.6" dependencies = [ "apache-avro", "arrow-schema", diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_io.py b/python/pysail/tests/spark/iceberg/test_iceberg_io.py index bcb7e40aaf..cee0fe5b68 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_io.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_io.py @@ -5,9 +5,6 @@ from pyiceberg.catalog import load_catalog from pyiceberg.schema import Schema from pyiceberg.types import DoubleType, LongType, NestedField, StringType -from pyspark.sql.types import Row - -from ..utils import get_data_files class TestIcebergIO: @@ -32,11 +29,9 @@ def test_iceberg_io_basic_read(self, spark, iceberg_test_data, expected_pandas_d catalog = load_catalog( "test_catalog", - **{ - "type": "sql", - "uri": f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - "warehouse": f"file://{warehouse_path}", - }, + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", ) catalog.create_namespace("default") @@ -76,11 +71,9 @@ def 
test_iceberg_io_read_with_sql(self, spark, iceberg_test_data, expected_panda catalog = load_catalog( "test_catalog", - **{ - "type": "sql", - "uri": f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - "warehouse": f"file://{warehouse_path}", - }, + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", ) catalog.create_namespace("default") @@ -125,11 +118,9 @@ def test_iceberg_io_multiple_files(self, spark, tmp_path): catalog = load_catalog( "test_catalog", - **{ - "type": "sql", - "uri": f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - "warehouse": f"file://{warehouse_path}", - }, + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", ) catalog.create_namespace("default") @@ -167,6 +158,6 @@ def test_iceberg_io_multiple_files(self, spark, tmp_path): result_df.toPandas(), expected_data.sort_values("id").reset_index(drop=True), check_dtype=False ) - assert result_df.count() == 4 + assert result_df.count() == 4 # noqa: PLR2004 finally: catalog.drop_table(f"default.{table_name}") From 3b45d109c3908642227aade61c5b82245806abd0 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 9 Oct 2025 11:30:00 +0800 Subject: [PATCH 03/32] update --- .../sail-iceberg/src/datasource/provider.rs | 17 ++-- crates/sail-iceberg/src/spec/manifest.rs | 70 +++++++++++++- crates/sail-iceberg/src/spec/manifest_list.rs | 62 +++++++------ crates/sail-iceberg/src/spec/mod.rs | 6 ++ crates/sail-iceberg/src/spec/name_mapping.rs | 62 +++++++++++++ crates/sail-iceberg/src/spec/snapshot.rs | 56 +++++++++++- crates/sail-iceberg/src/spec/sort.rs | 91 +++++++++++++++++++ crates/sail-iceberg/src/spec/statistics.rs | 46 ++++++++++ .../sail-iceberg/src/spec/table_metadata.rs | 14 +-- crates/sail-iceberg/src/spec/values.rs | 16 ++++ 10 files changed, 397 insertions(+), 43 deletions(-) create mode 100644 crates/sail-iceberg/src/spec/name_mapping.rs create mode 100644 crates/sail-iceberg/src/spec/sort.rs create mode 100644 crates/sail-iceberg/src/spec/statistics.rs diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 540c368b3c..dc7417bef0 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -520,12 +520,12 @@ impl From for ManifestFile { sequence_number: avro.sequence_number, min_sequence_number: avro.min_sequence_number, added_snapshot_id: avro.added_snapshot_id, - added_files_count: avro.added_files_count, - existing_files_count: avro.existing_files_count, - deleted_files_count: avro.deleted_files_count, - added_rows_count: avro.added_rows_count, - existing_rows_count: avro.existing_rows_count, - deleted_rows_count: avro.deleted_rows_count, + added_files_count: Some(avro.added_files_count), + existing_files_count: Some(avro.existing_files_count), + deleted_files_count: Some(avro.deleted_files_count), + added_rows_count: Some(avro.added_rows_count), + existing_rows_count: Some(avro.existing_rows_count), + deleted_rows_count: Some(avro.deleted_rows_count), partitions, key_metadata: avro.key_metadata, } @@ -779,6 +779,7 @@ impl DataFileAvro { nan_value_counts, lower_bounds, upper_bounds, + block_size_in_bytes: None, key_metadata: self.key_metadata, split_offsets: self.split_offsets.unwrap_or_default(), equality_ids: self @@ -788,7 +789,11 @@ impl DataFileAvro { .map(|v| v as i32) .collect(), sort_order_id: self.sort_order_id, + first_row_id: None, partition_spec_id, + referenced_data_file: None, + content_offset: None, 
+ content_size_in_bytes: None, } } } diff --git a/crates/sail-iceberg/src/spec/manifest.rs b/crates/sail-iceberg/src/spec/manifest.rs index e05324417b..ac8b29e71f 100644 --- a/crates/sail-iceberg/src/spec/manifest.rs +++ b/crates/sail-iceberg/src/spec/manifest.rs @@ -156,9 +156,14 @@ pub struct ManifestEntry { /// The status of the data file. pub status: ManifestStatus, /// The snapshot ID when the data file was added to the table. - pub snapshot_id: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub snapshot_id: Option, /// The sequence number when the data file was added to the table. - pub sequence_number: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub sequence_number: Option, + /// The file sequence number indicating when the file was added. + #[serde(skip_serializing_if = "Option::is_none")] + pub file_sequence_number: Option, /// The data file. pub data_file: DataFile, } @@ -167,14 +172,16 @@ impl ManifestEntry { /// Create a new manifest entry. pub fn new( status: ManifestStatus, - snapshot_id: i64, - sequence_number: i64, + snapshot_id: Option, + sequence_number: Option, + file_sequence_number: Option, data_file: DataFile, ) -> Self { Self { status, snapshot_id, sequence_number, + file_sequence_number, data_file, } } @@ -213,6 +220,9 @@ pub struct DataFile { /// Map from column id to upper bound in the column. #[serde(skip_serializing_if = "HashMap::is_empty")] pub upper_bounds: HashMap, + /// Block size in bytes. + #[serde(skip_serializing_if = "Option::is_none")] + pub block_size_in_bytes: Option, /// Implementation-specific key metadata for encryption. #[serde(skip_serializing_if = "Option::is_none")] pub key_metadata: Option>, @@ -225,8 +235,20 @@ pub struct DataFile { /// ID representing sort order for this file. #[serde(skip_serializing_if = "Option::is_none")] pub sort_order_id: Option, + /// The _row_id for the first row in the data file. + #[serde(skip_serializing_if = "Option::is_none")] + pub first_row_id: Option, /// The partition spec id used when writing this data file. pub partition_spec_id: i32, + /// Fully qualified location of a data file that all deletes reference. + #[serde(skip_serializing_if = "Option::is_none")] + pub referenced_data_file: Option, + /// The offset in the file where the content starts (for deletion vectors). + #[serde(skip_serializing_if = "Option::is_none")] + pub content_offset: Option, + /// The size of the referenced content in bytes (for deletion vectors). + #[serde(skip_serializing_if = "Option::is_none")] + pub content_size_in_bytes: Option, } impl DataFile { @@ -311,11 +333,16 @@ pub struct DataFileBuilder { nan_value_counts: HashMap, lower_bounds: HashMap, upper_bounds: HashMap, + block_size_in_bytes: Option, key_metadata: Option>, split_offsets: Vec, equality_ids: Vec, sort_order_id: Option, + first_row_id: Option, partition_spec_id: i32, + referenced_data_file: Option, + content_offset: Option, + content_size_in_bytes: Option, } impl DataFileBuilder { @@ -334,11 +361,16 @@ impl DataFileBuilder { nan_value_counts: HashMap::new(), lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), + block_size_in_bytes: None, key_metadata: None, split_offsets: Vec::new(), equality_ids: Vec::new(), sort_order_id: None, + first_row_id: None, partition_spec_id: 0, + referenced_data_file: None, + content_offset: None, + content_size_in_bytes: None, } } @@ -414,6 +446,31 @@ impl DataFileBuilder { self } + /// Set the block size in bytes. 
+ pub fn with_block_size_in_bytes(mut self, block_size_in_bytes: i64) -> Self { + self.block_size_in_bytes = Some(block_size_in_bytes); + self + } + + /// Set the first row id. + pub fn with_first_row_id(mut self, first_row_id: i64) -> Self { + self.first_row_id = Some(first_row_id); + self + } + + /// Set the referenced data file path. + pub fn with_referenced_data_file(mut self, path: impl ToString) -> Self { + self.referenced_data_file = Some(path.to_string()); + self + } + + /// Set the content offset and size in bytes. + pub fn with_content_offset_and_size(mut self, offset: i64, size_in_bytes: i64) -> Self { + self.content_offset = Some(offset); + self.content_size_in_bytes = Some(size_in_bytes); + self + } + /// Build the data file. pub fn build(self) -> Result { let file_path = self.file_path.ok_or("file_path is required")?; @@ -431,11 +488,16 @@ impl DataFileBuilder { nan_value_counts: self.nan_value_counts, lower_bounds: self.lower_bounds, upper_bounds: self.upper_bounds, + block_size_in_bytes: self.block_size_in_bytes, key_metadata: self.key_metadata, split_offsets: self.split_offsets, equality_ids: self.equality_ids, sort_order_id: self.sort_order_id, + first_row_id: self.first_row_id, partition_spec_id: self.partition_spec_id, + referenced_data_file: self.referenced_data_file, + content_offset: self.content_offset, + content_size_in_bytes: self.content_size_in_bytes, }) } } diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs index b7559d460a..12cb84d6de 100644 --- a/crates/sail-iceberg/src/spec/manifest_list.rs +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -81,17 +81,23 @@ pub struct ManifestFile { /// The snapshot ID when the manifest was added to the table. pub added_snapshot_id: i64, /// The number of files added in this manifest. - pub added_files_count: i32, + #[serde(skip_serializing_if = "Option::is_none")] + pub added_files_count: Option, /// The number of existing files in this manifest. - pub existing_files_count: i32, + #[serde(skip_serializing_if = "Option::is_none")] + pub existing_files_count: Option, /// The number of deleted files in this manifest. - pub deleted_files_count: i32, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_files_count: Option, /// The number of rows added in this manifest. - pub added_rows_count: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub added_rows_count: Option, /// The number of existing rows in this manifest. - pub existing_rows_count: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub existing_rows_count: Option, /// The number of deleted rows in this manifest. - pub deleted_rows_count: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_rows_count: Option, /// A list of field summaries for each partition field in the spec. /// Each field in the list corresponds to a field in the manifest file's partition spec. #[serde(skip_serializing_if = "Option::is_none")] @@ -109,12 +115,16 @@ impl ManifestFile { /// Get the total number of files in this manifest. pub fn total_files_count(&self) -> i32 { - self.added_files_count + self.existing_files_count + self.deleted_files_count + self.added_files_count.unwrap_or(0) + + self.existing_files_count.unwrap_or(0) + + self.deleted_files_count.unwrap_or(0) } /// Get the total number of rows in this manifest. 
pub fn total_rows_count(&self) -> i64 { - self.added_rows_count + self.existing_rows_count + self.deleted_rows_count + self.added_rows_count.unwrap_or(0) + + self.existing_rows_count.unwrap_or(0) + + self.deleted_rows_count.unwrap_or(0) } } @@ -175,12 +185,12 @@ pub struct ManifestFileBuilder { sequence_number: i64, min_sequence_number: i64, added_snapshot_id: i64, - added_files_count: i32, - existing_files_count: i32, - deleted_files_count: i32, - added_rows_count: i64, - existing_rows_count: i64, - deleted_rows_count: i64, + added_files_count: Option, + existing_files_count: Option, + deleted_files_count: Option, + added_rows_count: Option, + existing_rows_count: Option, + deleted_rows_count: Option, partitions: Option>, key_metadata: Option>, } @@ -196,12 +206,12 @@ impl ManifestFileBuilder { sequence_number: UNASSIGNED_SEQUENCE_NUMBER, min_sequence_number: UNASSIGNED_SEQUENCE_NUMBER, added_snapshot_id: 0, - added_files_count: 0, - existing_files_count: 0, - deleted_files_count: 0, - added_rows_count: 0, - existing_rows_count: 0, - deleted_rows_count: 0, + added_files_count: None, + existing_files_count: None, + deleted_files_count: None, + added_rows_count: None, + existing_rows_count: None, + deleted_rows_count: None, partitions: None, key_metadata: None, } @@ -251,17 +261,17 @@ impl ManifestFileBuilder { /// Set the file counts. pub fn with_file_counts(mut self, added: i32, existing: i32, deleted: i32) -> Self { - self.added_files_count = added; - self.existing_files_count = existing; - self.deleted_files_count = deleted; + self.added_files_count = Some(added); + self.existing_files_count = Some(existing); + self.deleted_files_count = Some(deleted); self } /// Set the row counts. pub fn with_row_counts(mut self, added: i64, existing: i64, deleted: i64) -> Self { - self.added_rows_count = added; - self.existing_rows_count = existing; - self.deleted_rows_count = deleted; + self.added_rows_count = Some(added); + self.existing_rows_count = Some(existing); + self.deleted_rows_count = Some(deleted); self } diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index ded3e76444..3fe8620414 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -1,9 +1,12 @@ pub mod datatypes; pub mod manifest; pub mod manifest_list; +pub mod name_mapping; pub mod partition; pub mod schema; pub mod snapshot; +pub mod sort; +pub mod statistics; pub mod table_metadata; pub mod transform; pub mod values; @@ -11,9 +14,12 @@ pub mod values; pub use datatypes::*; pub use manifest::*; pub use manifest_list::*; +pub use name_mapping::*; pub use partition::*; pub use schema::*; pub use snapshot::*; +pub use sort::*; +pub use statistics::*; pub use table_metadata::*; pub use transform::*; pub use values::*; diff --git a/crates/sail-iceberg/src/spec/name_mapping.rs b/crates/sail-iceberg/src/spec/name_mapping.rs new file mode 100644 index 0000000000..858b73f9d8 --- /dev/null +++ b/crates/sail-iceberg/src/spec/name_mapping.rs @@ -0,0 +1,62 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +/// Default schema name mapping property key +pub const DEFAULT_SCHEMA_NAME_MAPPING: &str = "schema.name-mapping.default"; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct NameMapping { + #[serde(rename = "fields")] + root: Vec, +} + +impl NameMapping { + /// Create a new `NameMapping` given mapped fields. 
+ pub fn new(fields: Vec) -> Self { + Self { root: fields } + } + + /// Returns mapped fields + pub fn fields(&self) -> &[MappedField] { + &self.root + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct MappedField { + #[serde(skip_serializing_if = "Option::is_none")] + field_id: Option, + names: Vec, + #[serde(default)] + #[serde(skip_serializing_if = "Vec::is_empty")] + fields: Vec>, +} + +impl MappedField { + /// Create a new `MappedField`. + pub fn new(field_id: Option, names: Vec, fields: Vec) -> Self { + Self { + field_id, + names, + fields: fields.into_iter().map(Arc::new).collect(), + } + } + + /// Optional field id + pub fn field_id(&self) -> Option { + self.field_id + } + + /// All names for this field + pub fn names(&self) -> &[String] { + &self.names + } + + /// Child mapped fields + pub fn fields(&self) -> &[Arc] { + &self.fields + } +} diff --git a/crates/sail-iceberg/src/spec/snapshot.rs b/crates/sail-iceberg/src/spec/snapshot.rs index 237cf1a21f..84a44749b7 100644 --- a/crates/sail-iceberg/src/spec/snapshot.rs +++ b/crates/sail-iceberg/src/spec/snapshot.rs @@ -91,7 +91,11 @@ pub struct Snapshot { pub timestamp_ms: i64, /// The location of a manifest list for this snapshot that /// tracks manifest files with additional metadata. + #[serde(default)] pub manifest_list: String, + /// V1 snapshots list manifests directly instead of a manifest list file. + #[serde(skip_serializing_if = "Option::is_none")] + pub manifests: Option>, /// A string map that summarizes the snapshot changes, including operation. pub summary: Summary, /// ID of the table's current schema when the snapshot was created. @@ -99,6 +103,48 @@ pub struct Snapshot { pub schema_id: Option, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +/// A reference’s snapshot and retention policy +pub struct SnapshotReference { + /// A reference’s snapshot ID. The tagged snapshot or latest snapshot of a branch. + pub snapshot_id: i64, + #[serde(flatten)] + /// Snapshot retention policy + pub retention: SnapshotRetention, +} + +impl SnapshotReference { + /// Returns true if the snapshot reference is a branch. + pub fn is_branch(&self) -> bool { + matches!(self.retention, SnapshotRetention::Branch { .. }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case", tag = "type")] +/// Snapshot retention policy +pub enum SnapshotRetention { + /// Branches are mutable named references that can be updated by committing a new snapshot + Branch { + /// Minimum number of snapshots to keep in a branch while expiring snapshots. + #[serde(skip_serializing_if = "Option::is_none")] + min_snapshots_to_keep: Option, + /// Max age of snapshots to keep when expiring, including the latest snapshot. + #[serde(skip_serializing_if = "Option::is_none")] + max_snapshot_age_ms: Option, + /// Max age of the snapshot reference to keep while expiring snapshots. + #[serde(skip_serializing_if = "Option::is_none")] + max_ref_age_ms: Option, + }, + /// Tags are labels for individual snapshots. + Tag { + /// Max age of the snapshot reference to keep while expiring snapshots. + #[serde(skip_serializing_if = "Option::is_none")] + max_ref_age_ms: Option, + }, +} + impl Snapshot { /// Create a new snapshot builder. 
pub fn builder() -> SnapshotBuilder { @@ -129,6 +175,12 @@ impl Snapshot { &self.manifest_list } + /// Get V1 manifests list if present + #[inline] + pub fn manifests(&self) -> Option<&[String]> { + self.manifests.as_deref() + } + /// Get summary of the snapshot #[inline] pub fn summary(&self) -> &Summary { @@ -225,7 +277,8 @@ impl SnapshotBuilder { /// Build the snapshot. pub fn build(self) -> Result { - let manifest_list = self.manifest_list.ok_or("manifest_list is required")?; + // For V1 compatibility allow manifest_list to be missing when manifests provided + let manifest_list = self.manifest_list.unwrap_or_default(); let summary = self .summary .unwrap_or_else(|| Summary::new(Operation::Append)); @@ -236,6 +289,7 @@ impl SnapshotBuilder { sequence_number: self.sequence_number, timestamp_ms: self.timestamp_ms, manifest_list, + manifests: None, summary, schema_id: self.schema_id, }) diff --git a/crates/sail-iceberg/src/spec/sort.rs b/crates/sail-iceberg/src/spec/sort.rs new file mode 100644 index 0000000000..e7f19da384 --- /dev/null +++ b/crates/sail-iceberg/src/spec/sort.rs @@ -0,0 +1,91 @@ +use std::fmt::{Display, Formatter}; + +use serde::{Deserialize, Serialize}; + +use super::transform::Transform; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Copy, Clone)] +/// Sort direction in a partition, either ascending or descending +pub enum SortDirection { + /// Ascending + #[serde(rename = "asc")] + Ascending, + /// Descending + #[serde(rename = "desc")] + Descending, +} + +impl Display for SortDirection { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + SortDirection::Ascending => write!(f, "asc"), + SortDirection::Descending => write!(f, "desc"), + } + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Copy, Clone)] +pub enum NullOrder { + #[serde(rename = "nulls-first")] + /// Nulls are stored first + First, + #[serde(rename = "nulls-last")] + /// Nulls are stored last + Last, +} + +impl Display for NullOrder { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + NullOrder::First => write!(f, "nulls-first"), + NullOrder::Last => write!(f, "nulls-last"), + } + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct SortField { + /// A source column id from the table’s schema + pub source_id: i32, + /// A transform that is used to produce values to be sorted on from the source column. + pub transform: Transform, + /// A sort direction, that can only be either asc or desc + pub direction: SortDirection, + /// A null order that describes the order of null values when sorted. + pub null_order: NullOrder, +} + +impl Display for SortField { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} {} {} {}", + self.source_id, self.transform, self.direction, self.null_order + ) + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct SortOrder { + /// Identifier for SortOrder, order_id `0` is no sort order. 
+ #[serde(default)] + pub order_id: i32, + /// Details of the sort + #[serde(default)] + pub fields: Vec, +} + +impl SortOrder { + pub fn unsorted_order() -> SortOrder { + SortOrder { + order_id: 0, + fields: vec![], + } + } + + pub fn is_unsorted(&self) -> bool { + self.order_id == 0 || self.fields.is_empty() + } +} diff --git a/crates/sail-iceberg/src/spec/statistics.rs b/crates/sail-iceberg/src/spec/statistics.rs new file mode 100644 index 0000000000..d184e76079 --- /dev/null +++ b/crates/sail-iceberg/src/spec/statistics.rs @@ -0,0 +1,46 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct StatisticsFile { + /// The snapshot id of the statistics file. + pub snapshot_id: i64, + /// Path of the statistics file + pub statistics_path: String, + /// File size in bytes + pub file_size_in_bytes: i64, + /// File footer size in bytes + pub file_footer_size_in_bytes: i64, + /// Base64-encoded implementation-specific key metadata for encryption. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub key_metadata: Option, + /// Blob metadata + pub blob_metadata: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct BlobMetadata { + /// Type of the blob. + pub r#type: String, + /// Snapshot id of the blob. + pub snapshot_id: i64, + /// Sequence number of the blob. + pub sequence_number: i64, + /// Fields of the blob. + pub fields: Vec, + /// Properties of the blob. + #[serde(default, skip_serializing_if = "std::collections::HashMap::is_empty")] + pub properties: std::collections::HashMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct PartitionStatisticsFile { + /// The snapshot id of the statistics file. 
+ pub snapshot_id: i64, + /// Path of the statistics file + pub statistics_path: String, + /// File size in bytes + pub file_size_in_bytes: i64, +} diff --git a/crates/sail-iceberg/src/spec/table_metadata.rs b/crates/sail-iceberg/src/spec/table_metadata.rs index e1ebac60d9..ef27068393 100644 --- a/crates/sail-iceberg/src/spec/table_metadata.rs +++ b/crates/sail-iceberg/src/spec/table_metadata.rs @@ -1,10 +1,12 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use serde_json::Value as JsonValue; use uuid::Uuid; -use super::{FormatVersion, PartitionSpec, Schema, Snapshot}; +use super::{ + FormatVersion, PartitionSpec, PartitionStatisticsFile, Schema, Snapshot, SnapshotReference, + SortOrder, StatisticsFile, +}; /// Iceberg table metadata #[derive(Debug, Clone, Serialize, Deserialize)] @@ -52,19 +54,19 @@ pub struct TableMetadata { pub metadata_log: Vec, /// Sort orders for the table #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub sort_orders: Vec, + pub sort_orders: Vec, /// Default sort order ID #[serde(default, skip_serializing_if = "Option::is_none")] pub default_sort_order_id: Option, /// Named references to snapshots #[serde(default, skip_serializing_if = "HashMap::is_empty")] - pub refs: HashMap, + pub refs: HashMap, /// Statistics files #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub statistics: Vec, + pub statistics: Vec, /// Partition statistics files #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub partition_statistics: Vec, + pub partition_statistics: Vec, } /// Snapshot log entry diff --git a/crates/sail-iceberg/src/spec/values.rs b/crates/sail-iceberg/src/spec/values.rs index 140cc35ae1..e8acd48020 100644 --- a/crates/sail-iceberg/src/spec/values.rs +++ b/crates/sail-iceberg/src/spec/values.rs @@ -27,6 +27,22 @@ pub enum PrimitiveLiteral { Binary(Vec), } +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +/// Typed single-value used for lower/upper bounds +pub struct Datum { + /// Primitive data type of the datum + pub r#type: crate::spec::PrimitiveType, + /// Primitive literal value + pub literal: PrimitiveLiteral, +} + +impl Datum { + pub fn new(r#type: crate::spec::PrimitiveType, literal: PrimitiveLiteral) -> Self { + Self { r#type, literal } + } +} + impl Literal { // TODO: Type-aware JSON conversion pub fn try_from_json( From fe6ae7311793968331bbcff5cdf51c5e3625a23e Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 9 Oct 2025 11:43:09 +0800 Subject: [PATCH 04/32] update --- Cargo.lock | 11 +---------- crates/sail-iceberg/Cargo.toml | 16 ++++++---------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d2cf99cf1b..4dd1864832 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5659,33 +5659,24 @@ dependencies = [ [[package]] name = "sail-iceberg" -version = "0.3.6" +version = "0.3.7" dependencies = [ "apache-avro", "arrow-schema", "async-trait", - "base64 0.22.1", - "bytes", "chrono", "datafusion", "datafusion-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", "futures", - "indexmap", - "itertools 0.14.0", "log", "num-bigint", "num-traits", "object_store", "once_cell", "ordered-float 4.6.0", - "parquet", - "percent-encoding", "sail-common-datafusion", "serde", "serde_json", - "tokio", "url", "uuid", ] diff --git a/crates/sail-iceberg/Cargo.toml b/crates/sail-iceberg/Cargo.toml index e2820b0f7d..4eb5352482 100644 --- a/crates/sail-iceberg/Cargo.toml +++ b/crates/sail-iceberg/Cargo.toml @@ -5,13 
+5,10 @@ edition = { workspace = true } [dependencies] sail-common-datafusion = { path = "../sail-common-datafusion" } -# Delta Lake # DataFusion dependencies datafusion = { workspace = true } datafusion-common = { workspace = true } -datafusion-physical-expr-adapter = { workspace = true } -datafusion-physical-expr = { workspace = true } # Arrow dependencies arrow-schema = { workspace = true } @@ -24,20 +21,19 @@ serde_json = { workspace = true } url = { workspace = true } futures = { workspace = true } serde = { workspace = true, features = ["derive"] } -tokio = { workspace = true } uuid = { workspace = true } -parquet = { workspace = true } -bytes = { workspace = true } -indexmap = { workspace = true } +# parquet = { workspace = true } +# bytes = { workspace = true } +# indexmap = { workspace = true } log = { workspace = true } -itertools = { workspace = true } -percent-encoding = { workspace = true } +# itertools = { workspace = true } +# percent-encoding = { workspace = true } once_cell = { workspace = true } ordered-float = { workspace = true } apache-avro = { workspace = true } num-bigint = { workspace = true } num-traits = { workspace = true } -base64 = { workspace = true } +# base64 = { workspace = true } [lints] workspace = true From e0800ca6432488aaead11cf1dd5572bd9a4a511a Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 9 Oct 2025 11:54:00 +0800 Subject: [PATCH 05/32] update --- Cargo.lock | 6 +++--- Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4dd1864832..e871ae1e8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4403,9 +4403,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "4.6.0" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +checksum = "7f4779c6901a562440c3786d08192c6fbda7c1c2060edd10006b05ee35d10f2d" dependencies = [ "num-traits", "rand 0.8.5", @@ -5673,7 +5673,7 @@ dependencies = [ "num-traits", "object_store", "once_cell", - "ordered-float 4.6.0", + "ordered-float 5.1.0", "sail-common-datafusion", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 8c7e43ba01..f9d4a97ee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -102,7 +102,7 @@ moka = { version = "0.12.11", features = ["sync"] } bytes = "1.10.1" indexmap = "2.11.4" pin-project-lite = "0.2.16" -ordered-float = { version = "4.5.0", features = ["serde"] } +ordered-float = { version = "5.1.0", features = ["serde"] } apache-avro = { version = "0.20.0" } ###### From 6cd4008bd27f1512bebe1b33ee110fa28cf358eb Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 9 Oct 2025 12:22:02 +0800 Subject: [PATCH 06/32] update --- pyproject.toml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3bb045e7fb..1d0e8edf60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,8 @@ test = [ "duckdb>=1.0,<2", "pytest>=8.4,<9", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]>=0.8,<1", + "pyiceberg[sql-sqlite]==0.10.0", + "pydantic>=2.7,<2.8", ] mcp = [ "mcp>=1.0.0,<2", @@ -77,7 +78,8 @@ dependencies = [ "mcp>=1.0,<2", "boto3>=1.38,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]>=0.8,<1", + "pyiceberg[sql-sqlite]==0.10.0", + "pydantic>=2.7,<2.8", ] path = ".venvs/default" @@ -116,7 +118,8 @@ dependencies = [ "pytest>=8.4,<9", "duckdb>=1.1,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]>=0.8,<1", + "pyiceberg[sql-sqlite]==0.10.0", + "pydantic>=2.7,<2.8", ] 
[[tool.hatch.envs.test.matrix]] @@ -143,7 +146,8 @@ dependencies = [ "pytest-xdist>=3.7,<4", "pytest-timeout>=2.4,<3", "pytest-reportlog>=0.4,<0.5", - "pyiceberg[sql-sqlite]>=0.8,<1", + "pyiceberg[sql-sqlite]==0.10.0", + "pydantic>=2.7,<2.8", ] [[tool.hatch.envs.test-spark.matrix]] From 919b7c71a9e2796a707e1ce736469cef81c66e18 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 9 Oct 2025 15:40:48 +0800 Subject: [PATCH 07/32] update --- .../sail-iceberg/src/datasource/provider.rs | 631 +----------------- crates/sail-iceberg/src/spec/manifest.rs | 519 +++++++++++++- crates/sail-iceberg/src/spec/manifest_list.rs | 252 ++++++- 3 files changed, 785 insertions(+), 617 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index dc7417bef0..aedbdb492f 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -3,7 +3,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; -use apache_avro::{from_value, Reader as AvroReader}; use arrow_schema::Schema as ArrowSchema; use async_trait::async_trait; use datafusion::catalog::memory::DataSourceExec; @@ -20,13 +19,12 @@ use datafusion::logical_expr::{Expr, LogicalPlan}; use datafusion::physical_plan::ExecutionPlan; use object_store::path::Path as ObjectPath; use object_store::ObjectMeta; -use serde::{Deserialize, Serialize}; use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; use crate::spec::{ - DataContentType, DataFile, FieldSummary, Literal, ManifestContentType, ManifestFile, - ManifestList, ManifestStatus, PrimitiveLiteral, Schema, Snapshot, + DataFile, FormatVersion, Literal, Manifest, ManifestContentType, ManifestList, ManifestStatus, + PrimitiveLiteral, Schema, Snapshot, }; /// Iceberg table provider for DataFusion @@ -134,28 +132,8 @@ impl IcebergTableProvider { manifest_list_data.len() ); - self.parse_manifest_list(&manifest_list_data) - } - - /// Parse manifest list from Avro bytes - fn parse_manifest_list(&self, data: &[u8]) -> DataFusionResult { - log::debug!("[ICEBERG] Parsing manifest list Avro data"); - let reader = AvroReader::new(data) - .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - - let mut manifest_files = Vec::new(); - for value in reader { - let value = - value.map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - log::trace!("[ICEBERG] Deserializing manifest file entry"); - let manifest_file: ManifestFileAvro = from_value(&value).map_err(|e| { - log::error!("[ICEBERG] Failed to deserialize manifest file: {:?}", e); - datafusion::common::DataFusionError::External(Box::new(e)) - })?; - manifest_files.push(manifest_file.into()); - } - - Ok(ManifestList::new(manifest_files)) + ManifestList::parse_with_version(&manifest_list_data, FormatVersion::V2) + .map_err(datafusion::common::DataFusionError::Execution) } /// Load data files from manifests @@ -195,26 +173,22 @@ impl IcebergTableProvider { log::debug!("[ICEBERG] Read {} bytes from manifest", manifest_data.len()); - let manifest_entries = self.parse_manifest(&manifest_data)?; + let manifest = Manifest::parse_avro(&manifest_data) + .map_err(datafusion::common::DataFusionError::Execution)?; // Get partition_spec_id from manifest file let partition_spec_id = manifest_file.partition_spec_id; - for entry in manifest_entries { - // Only include added and existing files, skip deleted files - let status = match entry.status { - 0 => ManifestStatus::Existing, - 1 => 
ManifestStatus::Added, - 2 => ManifestStatus::Deleted, - _ => ManifestStatus::Existing, - }; - - if matches!(status, ManifestStatus::Added | ManifestStatus::Existing) { - // Convert DataFileAvro to DataFile with schema and partition_spec_id - let data_file = entry - .data_file - .into_data_file(&self.schema, partition_spec_id); - data_files.push(data_file); + for entry_ref in manifest.entries() { + let entry = entry_ref.as_ref(); + if matches!( + entry.status, + ManifestStatus::Added | ManifestStatus::Existing + ) { + let mut df = entry.data_file.clone(); + // overwrite partition_spec_id from manifest list file + df.partition_spec_id = partition_spec_id; + data_files.push(df); } } } @@ -222,42 +196,6 @@ impl IcebergTableProvider { Ok(data_files) } - /// Parse manifest from Avro bytes - fn parse_manifest(&self, data: &[u8]) -> DataFusionResult> { - log::debug!("[ICEBERG] Parsing manifest Avro data"); - let reader = AvroReader::new(data) - .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - - let mut entries = Vec::new(); - for value in reader { - let value = - value.map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - log::trace!("[ICEBERG] Deserializing data file entry"); - let mut entry: ManifestEntryAvro = from_value(&value).map_err(|e| { - log::error!("[ICEBERG] Failed to deserialize data file entry: {:?}", e); - datafusion::common::DataFusionError::External(Box::new(e)) - })?; - - // Extract map fields from raw Avro value - if let apache_avro::types::Value::Record(fields) = &value { - for (field_name, field_value) in fields { - if field_name == "data_file" { - if let apache_avro::types::Value::Record(data_file_fields) = field_value { - entry - .data_file - .extract_map_fields_from_avro(data_file_fields); - } - } - } - } - - entries.push(entry); - } - - log::debug!("[ICEBERG] Parsed {} entries from manifest", entries.len()); - Ok(entries) - } - /// Create partitioned files for DataFusion from Iceberg data files fn create_partitioned_files( &self, @@ -399,14 +337,21 @@ impl IcebergTableProvider { let min_value = data_file .lower_bounds() .get(&field_id) - .map(|literal| self.literal_to_scalar_value(literal)) + .map(|datum| { + // convert Datum -> Literal for existing scalar conversion + let lit = Literal::Primitive(datum.literal.clone()); + self.literal_to_scalar_value(&lit) + }) .map(Precision::Exact) .unwrap_or(Precision::Absent); let max_value = data_file .upper_bounds() .get(&field_id) - .map(|literal| self.literal_to_scalar_value(literal)) + .map(|datum| { + let lit = Literal::Primitive(datum.literal.clone()); + self.literal_to_scalar_value(&lit) + }) .map(Precision::Exact) .unwrap_or(Precision::Absent); @@ -428,376 +373,6 @@ impl IcebergTableProvider { } } -/// Avro representation of ManifestFile for deserialization -#[derive(Debug, Serialize, Deserialize)] -struct ManifestFileAvro { - #[serde(rename = "manifest_path")] - manifest_path: String, - #[serde(rename = "manifest_length")] - manifest_length: i64, - #[serde(rename = "partition_spec_id")] - partition_spec_id: i32, - #[serde(rename = "content")] - content: i32, - #[serde(rename = "sequence_number")] - sequence_number: i64, - #[serde(rename = "min_sequence_number")] - min_sequence_number: i64, - #[serde(rename = "added_snapshot_id")] - added_snapshot_id: i64, - #[serde(rename = "added_files_count")] - added_files_count: i32, - #[serde(rename = "existing_files_count")] - existing_files_count: i32, - #[serde(rename = "deleted_files_count")] - deleted_files_count: i32, - 
#[serde(rename = "added_rows_count")] - added_rows_count: i64, - #[serde(rename = "existing_rows_count")] - existing_rows_count: i64, - #[serde(rename = "deleted_rows_count")] - deleted_rows_count: i64, - #[serde(rename = "partitions")] - partitions: Option>, - #[serde(rename = "key_metadata")] - key_metadata: Option>, -} - -#[derive(Debug, Serialize, Deserialize)] -struct FieldSummaryAvro { - #[serde(rename = "contains_null")] - contains_null: bool, - #[serde(rename = "contains_nan")] - contains_nan: Option, - #[serde(rename = "lower_bound")] - lower_bound: Option>, - #[serde(rename = "upper_bound")] - upper_bound: Option>, -} - -impl From for ManifestFile { - fn from(avro: ManifestFileAvro) -> Self { - let content = match avro.content { - 0 => ManifestContentType::Data, - 1 => ManifestContentType::Deletes, - _ => ManifestContentType::Data, - }; - - let partitions = avro.partitions.map(|summaries| { - summaries - .into_iter() - .map(|summary| { - let lower_bound = summary - .lower_bound - .and_then(|bytes| String::from_utf8(bytes).ok()) - .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); - - let upper_bound = summary - .upper_bound - .and_then(|bytes| String::from_utf8(bytes).ok()) - .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); - - let mut field_summary = FieldSummary::new(summary.contains_null); - if let Some(contains_nan) = summary.contains_nan { - field_summary = field_summary.with_contains_nan(contains_nan); - } - if let Some(lower) = lower_bound { - field_summary = field_summary.with_lower_bound(lower); - } - if let Some(upper) = upper_bound { - field_summary = field_summary.with_upper_bound(upper); - } - field_summary - }) - .collect() - }); - - ManifestFile { - manifest_path: avro.manifest_path, - manifest_length: avro.manifest_length, - partition_spec_id: avro.partition_spec_id, - content, - sequence_number: avro.sequence_number, - min_sequence_number: avro.min_sequence_number, - added_snapshot_id: avro.added_snapshot_id, - added_files_count: Some(avro.added_files_count), - existing_files_count: Some(avro.existing_files_count), - deleted_files_count: Some(avro.deleted_files_count), - added_rows_count: Some(avro.added_rows_count), - existing_rows_count: Some(avro.existing_rows_count), - deleted_rows_count: Some(avro.deleted_rows_count), - partitions, - key_metadata: avro.key_metadata, - } - } -} - -/// Parse Avro map format (array of {key, value} objects) to HashMap for i64 values -fn parse_i64_map_from_avro(values: &Option) -> HashMap { - use apache_avro::types::Value; - - let mut map = HashMap::new(); - - let vec_opt = if let Some(Value::Union(_, boxed)) = values { - if let Value::Array(vec) = boxed.as_ref() { - Some(vec) - } else { - None - } - } else if let Some(Value::Array(vec)) = values { - Some(vec) - } else { - None - }; - - if let Some(vec) = vec_opt { - for item in vec { - if let Value::Record(fields) = item { - let mut key_opt = None; - let mut value_opt = None; - - for (field_name, field_value) in fields { - match field_name.as_str() { - "key" => { - if let Value::Int(k) = field_value { - key_opt = Some(*k); - } - } - "value" => { - if let Value::Long(v) = field_value { - value_opt = Some(*v); - } - } - _ => {} - } - } - - if let (Some(key), Some(value)) = (key_opt, value_opt) { - map.insert(key, value); - } - } - } - } - - map -} - -/// Parse Avro map format for byte arrays from Avro Values -fn parse_bytes_map_from_avro( - values: &Option, -) -> Option>> { - use apache_avro::types::Value; - - if let Some(Value::Union(_, boxed)) = values { - if 
let Value::Array(vec) = boxed.as_ref() { - let mut map = HashMap::new(); - for item in vec { - if let Value::Record(fields) = item { - let mut key_opt = None; - let mut value_opt = None; - - for (field_name, field_value) in fields { - match field_name.as_str() { - "key" => { - if let Value::Int(k) = field_value { - key_opt = Some(*k); - } - } - "value" => { - if let Value::Bytes(b) = field_value { - value_opt = Some(b.clone()); - } - } - _ => {} - } - } - - if let (Some(key), Some(value)) = (key_opt, value_opt) { - map.insert(key, value); - } - } - } - return Some(map); - } - } else if let Some(Value::Array(vec)) = values { - let mut map = HashMap::new(); - for item in vec { - if let Value::Record(fields) = item { - let mut key_opt = None; - let mut value_opt = None; - - for (field_name, field_value) in fields { - match field_name.as_str() { - "key" => { - if let Value::Int(k) = field_value { - key_opt = Some(*k); - } - } - "value" => { - if let Value::Bytes(b) = field_value { - value_opt = Some(b.clone()); - } - } - _ => {} - } - } - - if let (Some(key), Some(value)) = (key_opt, value_opt) { - map.insert(key, value); - } - } - } - return Some(map); - } - - None -} - -/// Avro representation of ManifestEntry for deserialization -#[derive(Debug, Serialize, Deserialize)] -struct ManifestEntryAvro { - #[serde(rename = "status")] - status: i32, - #[serde(rename = "snapshot_id")] - snapshot_id: Option, - #[serde(rename = "sequence_number")] - sequence_number: Option, - #[serde(rename = "file_sequence_number")] - file_sequence_number: Option, - #[serde(rename = "data_file")] - data_file: DataFileAvro, -} - -#[derive(Debug, Serialize, Deserialize)] -struct DataFileAvro { - #[serde(rename = "content", default)] - content: i32, - #[serde(rename = "file_path")] - file_path: String, - #[serde(rename = "file_format")] - file_format: String, - #[serde(rename = "partition")] - partition: serde_json::Value, - #[serde(rename = "record_count")] - record_count: i64, - #[serde(rename = "file_size_in_bytes")] - file_size_in_bytes: i64, - #[serde(skip)] - column_sizes: Option, - #[serde(skip)] - value_counts: Option, - #[serde(skip)] - null_value_counts: Option, - #[serde(skip)] - nan_value_counts: Option, - #[serde(skip)] - lower_bounds: Option, - #[serde(skip)] - upper_bounds: Option, - #[serde(rename = "key_metadata")] - key_metadata: Option>, - #[serde(rename = "split_offsets")] - split_offsets: Option>, - #[serde(rename = "equality_ids")] - equality_ids: Option>, - #[serde(rename = "sort_order_id")] - sort_order_id: Option, -} - -impl DataFileAvro { - /// Extract map fields from raw Avro record fields - fn extract_map_fields_from_avro(&mut self, fields: &[(String, apache_avro::types::Value)]) { - for (field_name, field_value) in fields { - match field_name.as_str() { - "column_sizes" => self.column_sizes = Some(field_value.clone()), - "value_counts" => self.value_counts = Some(field_value.clone()), - "null_value_counts" => self.null_value_counts = Some(field_value.clone()), - "nan_value_counts" => self.nan_value_counts = Some(field_value.clone()), - "lower_bounds" => self.lower_bounds = Some(field_value.clone()), - "upper_bounds" => self.upper_bounds = Some(field_value.clone()), - _ => {} - } - } - } - - /// Convert DataFileAvro to DataFile with schema context for proper bound parsing - fn into_data_file(self, schema: &Schema, partition_spec_id: i32) -> DataFile { - let content = match self.content { - 0 => DataContentType::Data, - 1 => DataContentType::PositionDeletes, - 2 => 
DataContentType::EqualityDeletes, - _ => DataContentType::Data, - }; - - let file_format = match self.file_format.to_uppercase().as_str() { - "PARQUET" => crate::spec::DataFileFormat::Parquet, - "AVRO" => crate::spec::DataFileFormat::Avro, - "ORC" => crate::spec::DataFileFormat::Orc, - _ => crate::spec::DataFileFormat::Parquet, // Default - }; - - // Parse partition values from JSON - let partition = parse_partition_values(Some(&self.partition)); - - // Parse Avro map arrays (array of {key, value} records) - let column_sizes = parse_i64_map_from_avro(&self.column_sizes) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - - let value_counts = parse_i64_map_from_avro(&self.value_counts) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - - let null_value_counts = parse_i64_map_from_avro(&self.null_value_counts) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - - let nan_value_counts = parse_i64_map_from_avro(&self.nan_value_counts) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - - // Parse bounds from binary data using schema for proper type conversion - let lower_bounds_raw = parse_bytes_map_from_avro(&self.lower_bounds); - let upper_bounds_raw = parse_bytes_map_from_avro(&self.upper_bounds); - let lower_bounds = parse_bounds_from_binary(lower_bounds_raw.as_ref(), schema); - let upper_bounds = parse_bounds_from_binary(upper_bounds_raw.as_ref(), schema); - - DataFile { - content, - file_path: self.file_path, - file_format, - partition, - record_count: self.record_count as u64, - file_size_in_bytes: self.file_size_in_bytes as u64, - column_sizes, - value_counts, - null_value_counts, - nan_value_counts, - lower_bounds, - upper_bounds, - block_size_in_bytes: None, - key_metadata: self.key_metadata, - split_offsets: self.split_offsets.unwrap_or_default(), - equality_ids: self - .equality_ids - .unwrap_or_default() - .into_iter() - .map(|v| v as i32) - .collect(), - sort_order_id: self.sort_order_id, - first_row_id: None, - partition_spec_id, - referenced_data_file: None, - content_offset: None, - content_size_in_bytes: None, - } - } -} - #[async_trait] impl TableProvider for IcebergTableProvider { fn as_any(&self) -> &dyn Any { @@ -889,159 +464,3 @@ impl TableProvider for IcebergTableProvider { Ok(DataSourceExec::from_data_source(file_scan_config)) } } - -/// Parse partition values from JSON -fn parse_partition_values(partition_json: Option<&serde_json::Value>) -> Vec> { - match partition_json { - Some(serde_json::Value::Array(values)) => values - .iter() - .map(|value| match value { - serde_json::Value::Null => None, - serde_json::Value::Bool(b) => { - Some(Literal::Primitive(PrimitiveLiteral::Boolean(*b))) - } - serde_json::Value::Number(n) => { - if let Some(i) = n.as_i64() { - if i >= i32::MIN as i64 && i <= i32::MAX as i64 { - Some(Literal::Primitive(PrimitiveLiteral::Int(i as i32))) - } else { - Some(Literal::Primitive(PrimitiveLiteral::Long(i))) - } - } else { - n.as_f64().map(|f| { - Literal::Primitive(PrimitiveLiteral::Double( - ordered_float::OrderedFloat(f), - )) - }) - } - } - serde_json::Value::String(s) => { - Some(Literal::Primitive(PrimitiveLiteral::String(s.clone()))) - } - _ => None, - }) - .collect(), - Some(serde_json::Value::Object(_)) => { - vec![None] - } - _ => Vec::new(), - } -} - -/// Parse bounds from binary data using schema field types -fn parse_bounds_from_binary( - bounds_data: Option<&HashMap>>, - schema: &Schema, -) -> HashMap { - use crate::spec::Type; - - let mut bounds = HashMap::new(); - - if let Some(data) = 
bounds_data { - for (field_id, binary_data) in data { - // Find the field in schema to get its type - if let Some(field) = schema.field_by_id(*field_id) { - let field_type = field.field_type.as_ref(); - - // Parse based on primitive type - let literal = match field_type { - Type::Primitive(prim_type) => { - parse_primitive_bound(binary_data, prim_type).ok() - } - _ => None, - }; - - if let Some(lit) = literal { - bounds.insert(*field_id, lit); - } - } else { - // Fallback: if field not found, try to parse as string or binary - if let Ok(string_value) = String::from_utf8(binary_data.clone()) { - bounds.insert( - *field_id, - Literal::Primitive(PrimitiveLiteral::String(string_value)), - ); - } else { - bounds.insert( - *field_id, - Literal::Primitive(PrimitiveLiteral::Binary(binary_data.clone())), - ); - } - } - } - } - - bounds -} - -/// Parse a primitive bound value from binary data based on its type -/// Reference: https://iceberg.apache.org/spec/#binary-single-value-serialization -fn parse_primitive_bound( - bytes: &[u8], - prim_type: &crate::spec::PrimitiveType, -) -> Result { - use num_bigint::BigInt; - use num_traits::ToPrimitive; - - use crate::spec::PrimitiveType; - - let literal = match prim_type { - PrimitiveType::Boolean => { - let val = !(bytes.len() == 1 && bytes[0] == 0u8); - PrimitiveLiteral::Boolean(val) - } - PrimitiveType::Int | PrimitiveType::Date => { - let val = i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?); - PrimitiveLiteral::Int(val) - } - PrimitiveType::Long - | PrimitiveType::Time - | PrimitiveType::Timestamp - | PrimitiveType::Timestamptz - | PrimitiveType::TimestampNs - | PrimitiveType::TimestamptzNs => { - let val = if bytes.len() == 4 { - // Handle schema evolution case - i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?) as i64 - } else { - i64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i64 bytes")?) - }; - PrimitiveLiteral::Long(val) - } - PrimitiveType::Float => { - let val = f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?); - PrimitiveLiteral::Float(ordered_float::OrderedFloat(val)) - } - PrimitiveType::Double => { - let val = if bytes.len() == 4 { - // Handle schema evolution case - f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?) as f64 - } else { - f64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f64 bytes")?) - }; - PrimitiveLiteral::Double(ordered_float::OrderedFloat(val)) - } - PrimitiveType::String => { - let val = std::str::from_utf8(bytes) - .map_err(|_| "Invalid UTF-8")? - .to_string(); - PrimitiveLiteral::String(val) - } - PrimitiveType::Uuid => { - let val = u128::from_be_bytes(bytes.try_into().map_err(|_| "Invalid UUID bytes")?); - PrimitiveLiteral::UInt128(val) - } - PrimitiveType::Fixed(_) | PrimitiveType::Binary => { - PrimitiveLiteral::Binary(Vec::from(bytes)) - } - PrimitiveType::Decimal { .. 
} => { - let unscaled_value = BigInt::from_signed_bytes_be(bytes); - let val = unscaled_value - .to_i128() - .ok_or_else(|| format!("Can't convert bytes to i128: {:?}", bytes))?; - PrimitiveLiteral::Int128(val) - } - }; - - Ok(Literal::Primitive(literal)) -} diff --git a/crates/sail-iceberg/src/spec/manifest.rs b/crates/sail-iceberg/src/spec/manifest.rs index ac8b29e71f..8370bc67be 100644 --- a/crates/sail-iceberg/src/spec/manifest.rs +++ b/crates/sail-iceberg/src/spec/manifest.rs @@ -1,11 +1,15 @@ use std::collections::HashMap; use std::sync::Arc; +use apache_avro::types::Value as AvroValue; +use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; use serde::{Deserialize, Serialize}; +use super::datatypes::PrimitiveType; +use super::manifest_list::ManifestContentType; use super::partition::PartitionSpec; -use super::schema::SchemaRef; -use super::values::Literal; +use super::schema::{SchemaId, SchemaRef}; +use super::values::{Datum, Literal, PrimitiveLiteral}; /// Reference to [`ManifestEntry`]. pub type ManifestEntryRef = Arc; @@ -43,6 +47,493 @@ impl Manifest { let Self { entries, metadata } = self; (entries, metadata) } + + /// Parse manifest metadata and entries from bytes of avro file. + /// + /// TODO: Implement Avro decoding and projection for V1/V2. + pub(crate) fn try_from_avro_bytes( + bs: &[u8], + ) -> Result<(ManifestMetadata, Vec), String> { + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + + // Parse manifest metadata from avro user metadata + let meta = reader.user_metadata(); + let metadata = ManifestMetadata::parse_from_avro_meta(meta)?; + + // Determine partition type to guide value decoding when needed + let partition_type = metadata + .partition_spec + .partition_type(&metadata.schema) + .map_err(|e| format!("Partition type error: {e}"))?; + + // For entries, reuse the embedded schema in the Avro file and deserialize per record + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + let mut entries = Vec::new(); + for value in reader { + let value = value.map_err(|e| format!("Avro read value error: {e}"))?; + let entry_avro: ManifestEntryAvro = + avro_from_value(&value).map_err(|e| format!("Avro decode entry error: {e}"))?; + let data_file = entry_avro.data_file.into_data_file( + &metadata.schema, + partition_type.fields().len() as i32, + metadata.partition_spec.spec_id(), + ); + let status = match entry_avro.status { + 1 => ManifestStatus::Added, + 2 => ManifestStatus::Deleted, + _ => ManifestStatus::Existing, + }; + let entry = ManifestEntry::new( + status, + entry_avro.snapshot_id, + entry_avro.sequence_number, + entry_avro.file_sequence_number, + data_file, + ); + entries.push(entry); + } + + Ok((metadata, entries)) + } + + /// Parse a manifest from bytes of avro file. + /// + /// TODO: Implement Avro decoding and projection for V1/V2. 
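For orientation, a minimal sketch of how these parsing entry points compose on the read path (object-store I/O elided; `list_bytes` and `manifest_bytes` are assumed to already hold the Avro contents of the snapshot's manifest list and of one manifest it references):

    use crate::spec::{DataFile, FormatVersion, Manifest, ManifestList, ManifestStatus};

    fn live_files(list_bytes: &[u8], manifest_bytes: &[u8]) -> Result<Vec<DataFile>, String> {
        // The manifest list names the manifests and their partition spec ids.
        let list = ManifestList::parse_with_version(list_bytes, FormatVersion::V2)?;
        let _manifests = list.into_entries();
        // Each manifest yields entries; deleted files are skipped.
        let manifest = Manifest::parse_avro(manifest_bytes)?;
        Ok(manifest
            .entries()
            .iter()
            .filter(|e| matches!(e.status, ManifestStatus::Added | ManifestStatus::Existing))
            .map(|e| e.data_file.clone())
            .collect())
    }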
+ pub fn parse_avro(bs: &[u8]) -> Result { + let (metadata, entries) = Self::try_from_avro_bytes(bs)?; + Ok(Manifest::new(metadata, entries)) + } +} + +impl ManifestMetadata { + pub(crate) fn parse_from_avro_meta( + meta: &std::collections::HashMap>, + ) -> Result { + // schema + let schema_bs = meta + .get("schema") + .ok_or_else(|| "schema is required in manifest metadata but not found".to_string())?; + let schema: super::Schema = serde_json::from_slice(schema_bs) + .map_err(|e| format!("Fail to parse schema in manifest metadata: {e}"))?; + let schema_ref = std::sync::Arc::new(schema); + + // schema-id (optional) + let schema_id: i32 = meta + .get("schema-id") + .and_then(|bs| String::from_utf8(bs.clone()).ok()) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + + // partition-spec and id + let part_fields_bs = meta.get("partition-spec").ok_or_else(|| { + "partition-spec is required in manifest metadata but not found".to_string() + })?; + let part_fields: Vec = + serde_json::from_slice(part_fields_bs) + .map_err(|e| format!("Fail to parse partition spec in manifest metadata: {e}"))?; + let spec_id: i32 = meta + .get("partition-spec-id") + .and_then(|bs| String::from_utf8(bs.clone()).ok()) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + let mut builder = super::partition::PartitionSpec::builder().with_spec_id(spec_id); + for f in part_fields { + builder = builder.add_field_with_id(f.source_id, f.field_id, f.name, f.transform); + } + let partition_spec = builder.build(); + + // format-version + let format_version = meta + .get("format-version") + .and_then(|bs| serde_json::from_slice::(bs).ok()) + .unwrap_or(super::FormatVersion::V1); + + // content + let content = meta + .get("content") + .and_then(|bs| String::from_utf8(bs.clone()).ok()) + .map(|s| match s.to_ascii_lowercase().as_str() { + "deletes" => super::manifest_list::ManifestContentType::Deletes, + _ => super::manifest_list::ManifestContentType::Data, + }) + .unwrap_or(super::manifest_list::ManifestContentType::Data); + + Ok(ManifestMetadata::new( + schema_ref, + schema_id, + partition_spec, + format_version, + content, + )) + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct ManifestEntryAvro { + #[serde(rename = "status")] + status: i32, + #[serde(rename = "snapshot_id")] + snapshot_id: Option, + #[serde(rename = "sequence_number")] + sequence_number: Option, + #[serde(rename = "file_sequence_number")] + file_sequence_number: Option, + #[serde(rename = "data_file")] + data_file: DataFileAvro, +} + +#[derive(Debug, Serialize, Deserialize)] +struct DataFileAvro { + #[serde(rename = "content", default)] + content: i32, + #[serde(rename = "file_path")] + file_path: String, + #[serde(rename = "file_format")] + file_format: String, + #[serde(rename = "partition")] + partition: serde_json::Value, + #[serde(rename = "record_count")] + record_count: i64, + #[serde(rename = "file_size_in_bytes")] + file_size_in_bytes: i64, + #[serde(skip)] + column_sizes: Option, + #[serde(skip)] + value_counts: Option, + #[serde(skip)] + null_value_counts: Option, + #[serde(skip)] + nan_value_counts: Option, + #[serde(skip)] + lower_bounds: Option, + #[serde(skip)] + upper_bounds: Option, + #[serde(rename = "key_metadata")] + key_metadata: Option>, + #[serde(rename = "split_offsets")] + split_offsets: Option>, + #[serde(rename = "equality_ids")] + equality_ids: Option>, + #[serde(rename = "sort_order_id")] + sort_order_id: Option, +} + +impl DataFileAvro { + #[allow(dead_code)] + fn extract_map_fields_from_avro(&mut self, fields: &[(String, 
AvroValue)]) { + for (field_name, field_value) in fields { + match field_name.as_str() { + "column_sizes" => self.column_sizes = Some(field_value.clone()), + "value_counts" => self.value_counts = Some(field_value.clone()), + "null_value_counts" => self.null_value_counts = Some(field_value.clone()), + "nan_value_counts" => self.nan_value_counts = Some(field_value.clone()), + "lower_bounds" => self.lower_bounds = Some(field_value.clone()), + "upper_bounds" => self.upper_bounds = Some(field_value.clone()), + _ => {} + } + } + } + + fn into_data_file( + self, + schema: &super::Schema, + _partition_type_len: i32, + partition_spec_id: i32, + ) -> DataFile { + let content = match self.content { + 0 => DataContentType::Data, + 1 => DataContentType::PositionDeletes, + 2 => DataContentType::EqualityDeletes, + _ => DataContentType::Data, + }; + + let file_format = match self.file_format.to_uppercase().as_str() { + "PARQUET" => super::DataFileFormat::Parquet, + "AVRO" => super::DataFileFormat::Avro, + "ORC" => super::DataFileFormat::Orc, + _ => super::DataFileFormat::Parquet, + }; + + let partition = parse_partition_values(Some(&self.partition)); + + let column_sizes = parse_i64_map_from_avro(&self.column_sizes) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + let value_counts = parse_i64_map_from_avro(&self.value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + let null_value_counts = parse_i64_map_from_avro(&self.null_value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + let nan_value_counts = parse_i64_map_from_avro(&self.nan_value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + + let lower_bounds_raw = parse_bytes_map_from_avro(&self.lower_bounds); + let upper_bounds_raw = parse_bytes_map_from_avro(&self.upper_bounds); + let lower_bounds = parse_bounds_from_binary(lower_bounds_raw.as_ref(), schema); + let upper_bounds = parse_bounds_from_binary(upper_bounds_raw.as_ref(), schema); + + DataFile { + content, + file_path: self.file_path, + file_format, + partition, + record_count: self.record_count as u64, + file_size_in_bytes: self.file_size_in_bytes as u64, + column_sizes, + value_counts, + null_value_counts, + nan_value_counts, + lower_bounds, + upper_bounds, + block_size_in_bytes: None, + key_metadata: self.key_metadata, + split_offsets: self.split_offsets.unwrap_or_default(), + equality_ids: self + .equality_ids + .unwrap_or_default() + .into_iter() + .map(|v| v as i32) + .collect(), + sort_order_id: self.sort_order_id, + first_row_id: None, + partition_spec_id, + referenced_data_file: None, + content_offset: None, + content_size_in_bytes: None, + } + } +} + +fn parse_i64_map_from_avro(values: &Option) -> std::collections::HashMap { + use apache_avro::types::Value; + let mut map = std::collections::HashMap::new(); + if let Some(Value::Map(obj)) = values { + for (k, v) in obj { + if let Value::Long(i) = v { + map.insert(k.parse::().unwrap_or(0), *i); + } + } + return map; + } + if let Some(Value::Array(vec)) = values { + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + for (name, val) in fields { + match name.as_str() { + "key" => { + if let Value::Int(k) = val { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Long(vl) = val { + value_opt = Some(*vl); + } + } + _ => {} + } + } + if let (Some(k), Some(v)) = (key_opt, value_opt) { + map.insert(k, v); + } + } + } + } + map +} + +fn parse_bytes_map_from_avro( + values: &Option, +) -> Option>> { + 
use apache_avro::types::Value; + if let Some(Value::Map(obj)) = values { + let mut map = std::collections::HashMap::new(); + for (k, v) in obj { + if let Value::Bytes(b) = v { + map.insert(k.parse::().unwrap_or(0), b.clone()); + } + } + return Some(map); + } + if let Some(Value::Array(vec)) = values { + let mut map = std::collections::HashMap::new(); + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + for (name, val) in fields { + match name.as_str() { + "key" => { + if let Value::Int(k) = val { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Bytes(b) = val { + value_opt = Some(b.clone()); + } + } + _ => {} + } + } + if let (Some(k), Some(v)) = (key_opt, value_opt) { + map.insert(k, v); + } + } + } + return Some(map); + } + None +} + +// NOTE: These helpers mirror provider.rs logic, kept local to avoid cross-module deps. +fn parse_partition_values(json: Option<&serde_json::Value>) -> Vec> { + match json { + Some(serde_json::Value::Array(arr)) => arr + .iter() + .map(|v| match v { + serde_json::Value::Null => None, + serde_json::Value::Bool(b) => { + Some(Literal::Primitive(PrimitiveLiteral::Boolean(*b))) + } + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + if i >= i32::MIN as i64 && i <= i32::MAX as i64 { + Some(Literal::Primitive(PrimitiveLiteral::Int(i as i32))) + } else { + Some(Literal::Primitive(PrimitiveLiteral::Long(i))) + } + } else { + n.as_f64().map(|f| { + Literal::Primitive(PrimitiveLiteral::Double( + ordered_float::OrderedFloat(f), + )) + }) + } + } + serde_json::Value::String(s) => { + Some(Literal::Primitive(PrimitiveLiteral::String(s.clone()))) + } + _ => None, + }) + .collect(), + _ => Vec::new(), + } +} + +fn parse_bounds_from_binary( + bounds_data: Option<&std::collections::HashMap>>, + schema: &super::Schema, +) -> std::collections::HashMap { + use crate::spec::Type; + let mut bounds = std::collections::HashMap::new(); + if let Some(data) = bounds_data { + for (field_id, binary_data) in data { + if let Some(field) = schema.field_by_id(*field_id) { + let field_type = field.field_type.as_ref(); + let datum = match field_type { + Type::Primitive(prim_type) => { + parse_primitive_bound(binary_data, prim_type).ok() + } + _ => None, + }; + if let Some(d) = datum { + bounds.insert(*field_id, d); + } + } else if let Ok(string_value) = String::from_utf8(binary_data.clone()) { + bounds.insert( + *field_id, + Datum::new( + PrimitiveType::String, + PrimitiveLiteral::String(string_value), + ), + ); + } else { + bounds.insert( + *field_id, + Datum::new( + PrimitiveType::Binary, + PrimitiveLiteral::Binary(binary_data.clone()), + ), + ); + } + } + } + bounds +} + +fn parse_primitive_bound( + bytes: &[u8], + prim_type: &crate::spec::PrimitiveType, +) -> Result { + use num_bigint::BigInt; + use num_traits::ToPrimitive; + + use crate::spec::PrimitiveType; + let literal = match prim_type { + PrimitiveType::Boolean => { + let val = !(bytes.len() == 1 && bytes[0] == 0u8); + PrimitiveLiteral::Boolean(val) + } + PrimitiveType::Int | PrimitiveType::Date => { + let val = i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?); + PrimitiveLiteral::Int(val) + } + PrimitiveType::Long + | PrimitiveType::Time + | PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs => { + let val = if bytes.len() == 4 { + i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?) 
as i64 + } else { + i64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i64 bytes")?) + }; + PrimitiveLiteral::Long(val) + } + PrimitiveType::Float => { + let val = f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?); + PrimitiveLiteral::Float(ordered_float::OrderedFloat(val)) + } + PrimitiveType::Double => { + let val = if bytes.len() == 4 { + f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?) as f64 + } else { + f64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f64 bytes")?) + }; + PrimitiveLiteral::Double(ordered_float::OrderedFloat(val)) + } + PrimitiveType::String => { + let val = std::str::from_utf8(bytes) + .map_err(|_| "Invalid UTF-8")? + .to_string(); + PrimitiveLiteral::String(val) + } + PrimitiveType::Uuid => { + let val = u128::from_be_bytes(bytes.try_into().map_err(|_| "Invalid UUID bytes")?); + PrimitiveLiteral::UInt128(val) + } + PrimitiveType::Fixed(_) | PrimitiveType::Binary => { + PrimitiveLiteral::Binary(Vec::from(bytes)) + } + PrimitiveType::Decimal { .. } => { + let unscaled_value = BigInt::from_signed_bytes_be(bytes); + let val = unscaled_value + .to_i128() + .ok_or_else(|| format!("Can't convert bytes to i128: {:?}", bytes))?; + PrimitiveLiteral::Int128(val) + } + }; + Ok(Datum::new(prim_type.clone(), literal)) } /// Metadata about a manifest file. @@ -50,23 +541,31 @@ impl Manifest { pub struct ManifestMetadata { /// The schema of the table when the manifest was written. pub schema: SchemaRef, + /// ID of the schema used to write the manifest + pub schema_id: SchemaId, /// The partition spec used to write the manifest. pub partition_spec: PartitionSpec, /// The format version of the manifest. pub format_version: FormatVersion, + /// Type of content files tracked by the manifest: data or deletes + pub content: ManifestContentType, } impl ManifestMetadata { /// Create new manifest metadata. pub fn new( schema: SchemaRef, + schema_id: SchemaId, partition_spec: PartitionSpec, format_version: FormatVersion, + content: ManifestContentType, ) -> Self { Self { schema, + schema_id, partition_spec, format_version, + content, } } } @@ -216,10 +715,10 @@ pub struct DataFile { pub nan_value_counts: HashMap, /// Map from column id to lower bound in the column. #[serde(skip_serializing_if = "HashMap::is_empty")] - pub lower_bounds: HashMap, + pub lower_bounds: HashMap, /// Map from column id to upper bound in the column. #[serde(skip_serializing_if = "HashMap::is_empty")] - pub upper_bounds: HashMap, + pub upper_bounds: HashMap, /// Block size in bytes. #[serde(skip_serializing_if = "Option::is_none")] pub block_size_in_bytes: Option, @@ -308,12 +807,12 @@ impl DataFile { } /// Get lower bounds. - pub fn lower_bounds(&self) -> &HashMap { + pub fn lower_bounds(&self) -> &HashMap { &self.lower_bounds } /// Get upper bounds. - pub fn upper_bounds(&self) -> &HashMap { + pub fn upper_bounds(&self) -> &HashMap { &self.upper_bounds } } @@ -331,8 +830,8 @@ pub struct DataFileBuilder { value_counts: HashMap, null_value_counts: HashMap, nan_value_counts: HashMap, - lower_bounds: HashMap, - upper_bounds: HashMap, + lower_bounds: HashMap, + upper_bounds: HashMap, block_size_in_bytes: Option, key_metadata: Option>, split_offsets: Vec, @@ -435,13 +934,13 @@ impl DataFileBuilder { } /// Add lower bound. - pub fn with_lower_bound(mut self, column_id: i32, bound: Literal) -> Self { + pub fn with_lower_bound(mut self, column_id: i32, bound: Datum) -> Self { self.lower_bounds.insert(column_id, bound); self } /// Add upper bound. 
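As a concrete instance of the bound decoding above (Iceberg's single-value binary serialization), an `int` bound is four little-endian bytes and ends up as a typed `Datum`; a small sketch, assuming the spec types are re-exported as elsewhere in this crate:

    use crate::spec::{Datum, PrimitiveLiteral, PrimitiveType};

    fn int_bound_example() -> Datum {
        // 42 as an `int` bound: little-endian bytes [0x2A, 0x00, 0x00, 0x00].
        let bytes = 42i32.to_le_bytes();
        // parse_primitive_bound maps those bytes to a Datum carrying its type.
        Datum::new(PrimitiveType::Int, PrimitiveLiteral::Int(i32::from_le_bytes(bytes)))
    }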
- pub fn with_upper_bound(mut self, column_id: i32, bound: Literal) -> Self { + pub fn with_upper_bound(mut self, column_id: i32, bound: Datum) -> Self { self.upper_bounds.insert(column_id, bound); self } diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs index 12cb84d6de..ef00f13b50 100644 --- a/crates/sail-iceberg/src/spec/manifest_list.rs +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -1,6 +1,7 @@ +use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; use serde::{Deserialize, Serialize}; -use super::values::Literal; +use super::values::{Literal, PrimitiveLiteral}; pub const UNASSIGNED_SEQUENCE_NUMBER: i64 = -1; @@ -38,6 +39,128 @@ impl ManifestList { pub fn into_entries(self) -> Vec { self.entries } + + /// Parse manifest list from bytes with a specified version. + pub fn parse_with_version( + bs: &[u8], + _version: super::FormatVersion, + ) -> Result { + // Decode per-record to avoid array-level serde issues; field aliases/defaults cover V1/V2 + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + let mut manifest_files = Vec::new(); + for value in reader { + let value = value.map_err(|e| format!("Avro read value error: {e}"))?; + let mf_avro: ManifestFileAvro = + avro_from_value(&value).map_err(|e| format!("Avro decode error: {e}"))?; + manifest_files.push(mf_avro.into()); + } + Ok(ManifestList::new(manifest_files)) + } +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +struct ManifestFileAvro { + #[serde(rename = "manifest_path")] + manifest_path: String, + #[serde(rename = "manifest_length")] + manifest_length: i64, + #[serde(rename = "partition_spec_id")] + partition_spec_id: i32, + #[serde(rename = "content")] + content: i32, + #[serde(rename = "sequence_number")] + sequence_number: i64, + #[serde(rename = "min_sequence_number")] + min_sequence_number: i64, + #[serde(rename = "added_snapshot_id")] + added_snapshot_id: i64, + #[serde(alias = "added_data_files_count", rename = "added_files_count")] + added_files_count: i32, + #[serde(alias = "existing_data_files_count", rename = "existing_files_count")] + existing_files_count: i32, + #[serde(alias = "deleted_data_files_count", rename = "deleted_files_count")] + deleted_files_count: i32, + #[serde(rename = "added_rows_count")] + added_rows_count: i64, + #[serde(rename = "existing_rows_count")] + existing_rows_count: i64, + #[serde(rename = "deleted_rows_count")] + deleted_rows_count: i64, + #[serde(rename = "partitions")] + partitions: Option>, + #[serde(rename = "key_metadata")] + key_metadata: Option>, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +struct FieldSummaryAvro { + #[serde(rename = "contains_null")] + contains_null: bool, + #[serde(rename = "contains_nan")] + contains_nan: Option, + #[serde(rename = "lower_bound")] + lower_bound: Option>, + #[serde(rename = "upper_bound")] + upper_bound: Option>, +} + +impl From for ManifestFile { + fn from(avro: ManifestFileAvro) -> Self { + let content = match avro.content { + 0 => ManifestContentType::Data, + 1 => ManifestContentType::Deletes, + _ => ManifestContentType::Data, + }; + + let partitions = avro.partitions.map(|summaries| { + summaries + .into_iter() + .map(|summary| { + let lower_bound = summary + .lower_bound + .and_then(|bytes| String::from_utf8(bytes).ok()) + .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); + + let upper_bound = summary + .upper_bound + .and_then(|bytes| 
String::from_utf8(bytes).ok()) + .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); + + let mut field_summary = FieldSummary::new(summary.contains_null); + if let Some(contains_nan) = summary.contains_nan { + field_summary = field_summary.with_contains_nan(contains_nan); + } + if let Some(lower) = lower_bound { + field_summary = field_summary.with_lower_bound(lower); + } + if let Some(upper) = upper_bound { + field_summary = field_summary.with_upper_bound(upper); + } + field_summary + }) + .collect() + }); + + ManifestFile { + manifest_path: avro.manifest_path, + manifest_length: avro.manifest_length, + partition_spec_id: avro.partition_spec_id, + content, + sequence_number: avro.sequence_number, + min_sequence_number: avro.min_sequence_number, + added_snapshot_id: avro.added_snapshot_id, + added_files_count: Some(avro.added_files_count), + existing_files_count: Some(avro.existing_files_count), + deleted_files_count: Some(avro.deleted_files_count), + added_rows_count: Some(avro.added_rows_count), + existing_rows_count: Some(avro.existing_rows_count), + deleted_rows_count: Some(avro.deleted_rows_count), + partitions, + key_metadata: avro.key_metadata, + } + } } /// Status of a manifest file in a manifest list. @@ -316,3 +439,130 @@ impl Default for ManifestFileBuilder { Self::new() } } + +pub(super) mod _serde { + use serde::{Deserialize, Serialize}; + + use super::*; + + #[derive(Debug, Serialize, Deserialize)] + #[serde(transparent)] + pub(crate) struct ManifestListV1 { + entries: Vec, + } + + #[derive(Debug, Serialize, Deserialize)] + pub(crate) struct ManifestListV2 { + entries: Vec, + } + + #[derive(Debug, Serialize, Deserialize)] + pub(super) struct ManifestFileV1 { + pub manifest_path: String, + pub manifest_length: i64, + pub partition_spec_id: i32, + pub added_snapshot_id: i64, + pub added_data_files_count: Option, + pub existing_data_files_count: Option, + pub deleted_data_files_count: Option, + pub added_rows_count: Option, + pub existing_rows_count: Option, + pub deleted_rows_count: Option, + pub partitions: Option>, + pub key_metadata: Option>, + } + + #[derive(Debug, Serialize, Deserialize)] + pub(super) struct ManifestFileV2 { + pub manifest_path: String, + pub manifest_length: i64, + pub partition_spec_id: i32, + #[serde(default = "v2_default_content_for_v1")] + pub content: i32, + #[serde(default = "v2_default_sequence_number_for_v1")] + pub sequence_number: i64, + #[serde(default = "v2_default_min_sequence_number_for_v1")] + pub min_sequence_number: i64, + #[serde(alias = "added_data_files_count", alias = "added_files_count")] + pub added_files_count: i32, + #[serde(alias = "existing_data_files_count", alias = "existing_files_count")] + pub existing_files_count: i32, + #[serde(alias = "deleted_data_files_count", alias = "deleted_files_count")] + pub deleted_files_count: i32, + pub added_snapshot_id: i64, + pub added_rows_count: i64, + pub existing_rows_count: i64, + pub deleted_rows_count: i64, + pub partitions: Option>, + pub key_metadata: Option>, + } + + const fn v2_default_content_for_v1() -> i32 { + super::ManifestContentType::Data as i32 + } + const fn v2_default_sequence_number_for_v1() -> i64 { + 0 + } + const fn v2_default_min_sequence_number_for_v1() -> i64 { + 0 + } + + impl TryFrom for super::ManifestList { + type Error = String; + fn try_from(v1: ManifestListV1) -> Result { + let entries = v1 + .entries + .into_iter() + .map(|e| ManifestFile { + manifest_path: e.manifest_path, + manifest_length: e.manifest_length, + partition_spec_id: 
e.partition_spec_id, + content: ManifestContentType::Data, + sequence_number: 0, + min_sequence_number: 0, + added_snapshot_id: e.added_snapshot_id, + added_files_count: e.added_data_files_count, + existing_files_count: e.existing_data_files_count, + deleted_files_count: e.deleted_data_files_count, + added_rows_count: e.added_rows_count, + existing_rows_count: e.existing_rows_count, + deleted_rows_count: e.deleted_rows_count, + partitions: e.partitions, + key_metadata: e.key_metadata, + }) + .collect(); + Ok(super::ManifestList::new(entries)) + } + } + + impl TryFrom for super::ManifestList { + type Error = String; + fn try_from(v2: ManifestListV2) -> Result { + let entries = v2 + .entries + .into_iter() + .map(|e| ManifestFile { + manifest_path: e.manifest_path, + manifest_length: e.manifest_length, + partition_spec_id: e.partition_spec_id, + content: match e.content { + 1 => ManifestContentType::Deletes, + _ => ManifestContentType::Data, + }, + sequence_number: e.sequence_number, + min_sequence_number: e.min_sequence_number, + added_snapshot_id: e.added_snapshot_id, + added_files_count: Some(e.added_files_count), + existing_files_count: Some(e.existing_files_count), + deleted_files_count: Some(e.deleted_files_count), + added_rows_count: Some(e.added_rows_count), + existing_rows_count: Some(e.existing_rows_count), + deleted_rows_count: Some(e.deleted_rows_count), + partitions: e.partitions, + key_metadata: e.key_metadata, + }) + .collect(); + Ok(super::ManifestList::new(entries)) + } + } +} From e9781c4b77c56a4c75bf321cc182cca16f6621e9 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Fri, 10 Oct 2025 13:42:07 +0800 Subject: [PATCH 08/32] update --- crates/sail-iceberg/src/spec/encrypted_key.rs | 4 + crates/sail-iceberg/src/spec/format.rs | 40 + crates/sail-iceberg/src/spec/manifest.rs | 1008 ----------------- .../sail-iceberg/src/spec/manifest/_serde.rs | 138 +++ .../src/spec/manifest/data_file.rs | 105 ++ .../sail-iceberg/src/spec/manifest/entry.rs | 43 + .../src/spec/manifest/metadata.rs | 94 ++ crates/sail-iceberg/src/spec/manifest/mod.rs | 331 ++++++ .../sail-iceberg/src/spec/manifest/writer.rs | 5 + crates/sail-iceberg/src/spec/manifest_list.rs | 281 ++--- crates/sail-iceberg/src/spec/mod.rs | 18 +- .../{name_mapping.rs => name_mapping/mod.rs} | 2 - crates/sail-iceberg/src/spec/partition.rs | 13 + .../sail-iceberg/src/spec/snapshot_summary.rs | 4 + .../sail-iceberg/src/spec/statistic_file.rs | 46 + .../src/spec/table_metadata_builder.rs | 3 + crates/sail-iceberg/src/spec/transform.rs | 32 + crates/sail-iceberg/src/spec/view_metadata.rs | 3 + .../src/spec/view_metadata_builder.rs | 3 + crates/sail-iceberg/src/spec/view_version.rs | 3 + 20 files changed, 997 insertions(+), 1179 deletions(-) create mode 100644 crates/sail-iceberg/src/spec/encrypted_key.rs create mode 100644 crates/sail-iceberg/src/spec/format.rs delete mode 100644 crates/sail-iceberg/src/spec/manifest.rs create mode 100644 crates/sail-iceberg/src/spec/manifest/_serde.rs create mode 100644 crates/sail-iceberg/src/spec/manifest/data_file.rs create mode 100644 crates/sail-iceberg/src/spec/manifest/entry.rs create mode 100644 crates/sail-iceberg/src/spec/manifest/metadata.rs create mode 100644 crates/sail-iceberg/src/spec/manifest/mod.rs create mode 100644 crates/sail-iceberg/src/spec/manifest/writer.rs rename crates/sail-iceberg/src/spec/{name_mapping.rs => name_mapping/mod.rs} (99%) create mode 100644 crates/sail-iceberg/src/spec/snapshot_summary.rs create mode 100644 crates/sail-iceberg/src/spec/statistic_file.rs create 
mode 100644 crates/sail-iceberg/src/spec/table_metadata_builder.rs create mode 100644 crates/sail-iceberg/src/spec/view_metadata.rs create mode 100644 crates/sail-iceberg/src/spec/view_metadata_builder.rs create mode 100644 crates/sail-iceberg/src/spec/view_version.rs diff --git a/crates/sail-iceberg/src/spec/encrypted_key.rs b/crates/sail-iceberg/src/spec/encrypted_key.rs new file mode 100644 index 0000000000..498e72b11a --- /dev/null +++ b/crates/sail-iceberg/src/spec/encrypted_key.rs @@ -0,0 +1,4 @@ +// Awareness stub for non-read path +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct EncryptedKey; diff --git a/crates/sail-iceberg/src/spec/format.rs b/crates/sail-iceberg/src/spec/format.rs new file mode 100644 index 0000000000..955ddaebed --- /dev/null +++ b/crates/sail-iceberg/src/spec/format.rs @@ -0,0 +1,40 @@ +/// Format version of Iceberg. +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum FormatVersion { + /// Version 1 + V1 = 1, + /// Version 2 + V2 = 2, +} + +impl serde::Serialize for FormatVersion { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_i32(*self as i32) + } +} + +impl<'de> serde::Deserialize<'de> for FormatVersion { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let value = i32::deserialize(deserializer)?; + match value { + 1 => Ok(FormatVersion::V1), + 2 => Ok(FormatVersion::V2), + _ => Err(serde::de::Error::custom(format!( + "Invalid format version: {}", + value + ))), + } + } +} + +impl Default for FormatVersion { + fn default() -> Self { + Self::V2 + } +} diff --git a/crates/sail-iceberg/src/spec/manifest.rs b/crates/sail-iceberg/src/spec/manifest.rs deleted file mode 100644 index 8370bc67be..0000000000 --- a/crates/sail-iceberg/src/spec/manifest.rs +++ /dev/null @@ -1,1008 +0,0 @@ -use std::collections::HashMap; -use std::sync::Arc; - -use apache_avro::types::Value as AvroValue; -use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; -use serde::{Deserialize, Serialize}; - -use super::datatypes::PrimitiveType; -use super::manifest_list::ManifestContentType; -use super::partition::PartitionSpec; -use super::schema::{SchemaId, SchemaRef}; -use super::values::{Datum, Literal, PrimitiveLiteral}; - -/// Reference to [`ManifestEntry`]. -pub type ManifestEntryRef = Arc; - -/// A manifest contains metadata and a list of entries. -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct Manifest { - /// Metadata about the manifest. - pub metadata: ManifestMetadata, - /// Entries in the manifest. - pub entries: Vec, -} - -impl Manifest { - /// Create a new manifest. - pub fn new(metadata: ManifestMetadata, entries: Vec) -> Self { - Self { - metadata, - entries: entries.into_iter().map(Arc::new).collect(), - } - } - - /// Get the entries in the manifest. - pub fn entries(&self) -> &[ManifestEntryRef] { - &self.entries - } - - /// Get the metadata of the manifest. - pub fn metadata(&self) -> &ManifestMetadata { - &self.metadata - } - - /// Consume this Manifest, returning its constituent parts - pub fn into_parts(self) -> (Vec, ManifestMetadata) { - let Self { entries, metadata } = self; - (entries, metadata) - } - - /// Parse manifest metadata and entries from bytes of avro file. - /// - /// TODO: Implement Avro decoding and projection for V1/V2. 
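Because the new format.rs serializes FormatVersion as a bare integer, a JSON round trip is just the digit; a minimal test-style sketch of what the custom impls accept:

    use crate::spec::FormatVersion;

    #[test]
    fn format_version_round_trips_as_an_integer() {
        let v: FormatVersion = serde_json::from_str("2").unwrap();
        assert_eq!(v, FormatVersion::V2);
        assert_eq!(serde_json::to_string(&FormatVersion::V1).unwrap(), "1");
        // Any value other than 1 or 2 is rejected with a custom error.
        assert!(serde_json::from_str::<FormatVersion>("3").is_err());
    }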
- pub(crate) fn try_from_avro_bytes( - bs: &[u8], - ) -> Result<(ManifestMetadata, Vec), String> { - let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; - - // Parse manifest metadata from avro user metadata - let meta = reader.user_metadata(); - let metadata = ManifestMetadata::parse_from_avro_meta(meta)?; - - // Determine partition type to guide value decoding when needed - let partition_type = metadata - .partition_spec - .partition_type(&metadata.schema) - .map_err(|e| format!("Partition type error: {e}"))?; - - // For entries, reuse the embedded schema in the Avro file and deserialize per record - let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; - let mut entries = Vec::new(); - for value in reader { - let value = value.map_err(|e| format!("Avro read value error: {e}"))?; - let entry_avro: ManifestEntryAvro = - avro_from_value(&value).map_err(|e| format!("Avro decode entry error: {e}"))?; - let data_file = entry_avro.data_file.into_data_file( - &metadata.schema, - partition_type.fields().len() as i32, - metadata.partition_spec.spec_id(), - ); - let status = match entry_avro.status { - 1 => ManifestStatus::Added, - 2 => ManifestStatus::Deleted, - _ => ManifestStatus::Existing, - }; - let entry = ManifestEntry::new( - status, - entry_avro.snapshot_id, - entry_avro.sequence_number, - entry_avro.file_sequence_number, - data_file, - ); - entries.push(entry); - } - - Ok((metadata, entries)) - } - - /// Parse a manifest from bytes of avro file. - /// - /// TODO: Implement Avro decoding and projection for V1/V2. - pub fn parse_avro(bs: &[u8]) -> Result { - let (metadata, entries) = Self::try_from_avro_bytes(bs)?; - Ok(Manifest::new(metadata, entries)) - } -} - -impl ManifestMetadata { - pub(crate) fn parse_from_avro_meta( - meta: &std::collections::HashMap>, - ) -> Result { - // schema - let schema_bs = meta - .get("schema") - .ok_or_else(|| "schema is required in manifest metadata but not found".to_string())?; - let schema: super::Schema = serde_json::from_slice(schema_bs) - .map_err(|e| format!("Fail to parse schema in manifest metadata: {e}"))?; - let schema_ref = std::sync::Arc::new(schema); - - // schema-id (optional) - let schema_id: i32 = meta - .get("schema-id") - .and_then(|bs| String::from_utf8(bs.clone()).ok()) - .and_then(|s| s.parse::().ok()) - .unwrap_or(0); - - // partition-spec and id - let part_fields_bs = meta.get("partition-spec").ok_or_else(|| { - "partition-spec is required in manifest metadata but not found".to_string() - })?; - let part_fields: Vec = - serde_json::from_slice(part_fields_bs) - .map_err(|e| format!("Fail to parse partition spec in manifest metadata: {e}"))?; - let spec_id: i32 = meta - .get("partition-spec-id") - .and_then(|bs| String::from_utf8(bs.clone()).ok()) - .and_then(|s| s.parse::().ok()) - .unwrap_or(0); - let mut builder = super::partition::PartitionSpec::builder().with_spec_id(spec_id); - for f in part_fields { - builder = builder.add_field_with_id(f.source_id, f.field_id, f.name, f.transform); - } - let partition_spec = builder.build(); - - // format-version - let format_version = meta - .get("format-version") - .and_then(|bs| serde_json::from_slice::(bs).ok()) - .unwrap_or(super::FormatVersion::V1); - - // content - let content = meta - .get("content") - .and_then(|bs| String::from_utf8(bs.clone()).ok()) - .map(|s| match s.to_ascii_lowercase().as_str() { - "deletes" => super::manifest_list::ManifestContentType::Deletes, - _ => super::manifest_list::ManifestContentType::Data, - }) - 
.unwrap_or(super::manifest_list::ManifestContentType::Data); - - Ok(ManifestMetadata::new( - schema_ref, - schema_id, - partition_spec, - format_version, - content, - )) - } -} - -#[derive(Debug, Serialize, Deserialize)] -struct ManifestEntryAvro { - #[serde(rename = "status")] - status: i32, - #[serde(rename = "snapshot_id")] - snapshot_id: Option, - #[serde(rename = "sequence_number")] - sequence_number: Option, - #[serde(rename = "file_sequence_number")] - file_sequence_number: Option, - #[serde(rename = "data_file")] - data_file: DataFileAvro, -} - -#[derive(Debug, Serialize, Deserialize)] -struct DataFileAvro { - #[serde(rename = "content", default)] - content: i32, - #[serde(rename = "file_path")] - file_path: String, - #[serde(rename = "file_format")] - file_format: String, - #[serde(rename = "partition")] - partition: serde_json::Value, - #[serde(rename = "record_count")] - record_count: i64, - #[serde(rename = "file_size_in_bytes")] - file_size_in_bytes: i64, - #[serde(skip)] - column_sizes: Option, - #[serde(skip)] - value_counts: Option, - #[serde(skip)] - null_value_counts: Option, - #[serde(skip)] - nan_value_counts: Option, - #[serde(skip)] - lower_bounds: Option, - #[serde(skip)] - upper_bounds: Option, - #[serde(rename = "key_metadata")] - key_metadata: Option>, - #[serde(rename = "split_offsets")] - split_offsets: Option>, - #[serde(rename = "equality_ids")] - equality_ids: Option>, - #[serde(rename = "sort_order_id")] - sort_order_id: Option, -} - -impl DataFileAvro { - #[allow(dead_code)] - fn extract_map_fields_from_avro(&mut self, fields: &[(String, AvroValue)]) { - for (field_name, field_value) in fields { - match field_name.as_str() { - "column_sizes" => self.column_sizes = Some(field_value.clone()), - "value_counts" => self.value_counts = Some(field_value.clone()), - "null_value_counts" => self.null_value_counts = Some(field_value.clone()), - "nan_value_counts" => self.nan_value_counts = Some(field_value.clone()), - "lower_bounds" => self.lower_bounds = Some(field_value.clone()), - "upper_bounds" => self.upper_bounds = Some(field_value.clone()), - _ => {} - } - } - } - - fn into_data_file( - self, - schema: &super::Schema, - _partition_type_len: i32, - partition_spec_id: i32, - ) -> DataFile { - let content = match self.content { - 0 => DataContentType::Data, - 1 => DataContentType::PositionDeletes, - 2 => DataContentType::EqualityDeletes, - _ => DataContentType::Data, - }; - - let file_format = match self.file_format.to_uppercase().as_str() { - "PARQUET" => super::DataFileFormat::Parquet, - "AVRO" => super::DataFileFormat::Avro, - "ORC" => super::DataFileFormat::Orc, - _ => super::DataFileFormat::Parquet, - }; - - let partition = parse_partition_values(Some(&self.partition)); - - let column_sizes = parse_i64_map_from_avro(&self.column_sizes) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - let value_counts = parse_i64_map_from_avro(&self.value_counts) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - let null_value_counts = parse_i64_map_from_avro(&self.null_value_counts) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - let nan_value_counts = parse_i64_map_from_avro(&self.nan_value_counts) - .into_iter() - .map(|(k, v)| (k, v as u64)) - .collect(); - - let lower_bounds_raw = parse_bytes_map_from_avro(&self.lower_bounds); - let upper_bounds_raw = parse_bytes_map_from_avro(&self.upper_bounds); - let lower_bounds = parse_bounds_from_binary(lower_bounds_raw.as_ref(), schema); - let upper_bounds = 
parse_bounds_from_binary(upper_bounds_raw.as_ref(), schema); - - DataFile { - content, - file_path: self.file_path, - file_format, - partition, - record_count: self.record_count as u64, - file_size_in_bytes: self.file_size_in_bytes as u64, - column_sizes, - value_counts, - null_value_counts, - nan_value_counts, - lower_bounds, - upper_bounds, - block_size_in_bytes: None, - key_metadata: self.key_metadata, - split_offsets: self.split_offsets.unwrap_or_default(), - equality_ids: self - .equality_ids - .unwrap_or_default() - .into_iter() - .map(|v| v as i32) - .collect(), - sort_order_id: self.sort_order_id, - first_row_id: None, - partition_spec_id, - referenced_data_file: None, - content_offset: None, - content_size_in_bytes: None, - } - } -} - -fn parse_i64_map_from_avro(values: &Option) -> std::collections::HashMap { - use apache_avro::types::Value; - let mut map = std::collections::HashMap::new(); - if let Some(Value::Map(obj)) = values { - for (k, v) in obj { - if let Value::Long(i) = v { - map.insert(k.parse::().unwrap_or(0), *i); - } - } - return map; - } - if let Some(Value::Array(vec)) = values { - for item in vec { - if let Value::Record(fields) = item { - let mut key_opt = None; - let mut value_opt = None; - for (name, val) in fields { - match name.as_str() { - "key" => { - if let Value::Int(k) = val { - key_opt = Some(*k); - } - } - "value" => { - if let Value::Long(vl) = val { - value_opt = Some(*vl); - } - } - _ => {} - } - } - if let (Some(k), Some(v)) = (key_opt, value_opt) { - map.insert(k, v); - } - } - } - } - map -} - -fn parse_bytes_map_from_avro( - values: &Option, -) -> Option>> { - use apache_avro::types::Value; - if let Some(Value::Map(obj)) = values { - let mut map = std::collections::HashMap::new(); - for (k, v) in obj { - if let Value::Bytes(b) = v { - map.insert(k.parse::().unwrap_or(0), b.clone()); - } - } - return Some(map); - } - if let Some(Value::Array(vec)) = values { - let mut map = std::collections::HashMap::new(); - for item in vec { - if let Value::Record(fields) = item { - let mut key_opt = None; - let mut value_opt = None; - for (name, val) in fields { - match name.as_str() { - "key" => { - if let Value::Int(k) = val { - key_opt = Some(*k); - } - } - "value" => { - if let Value::Bytes(b) = val { - value_opt = Some(b.clone()); - } - } - _ => {} - } - } - if let (Some(k), Some(v)) = (key_opt, value_opt) { - map.insert(k, v); - } - } - } - return Some(map); - } - None -} - -// NOTE: These helpers mirror provider.rs logic, kept local to avoid cross-module deps. 
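The helpers flagged in the NOTE above (and their copies in the new manifest module) turn the JSON partition tuple of a manifest entry into typed literals; roughly, for a table partitioned by an int id and a string region:

    // JSON partition tuple as carried on the manifest entry ...
    let partition = serde_json::json!([1, "us-east", null]);
    // ... which parse_partition_values maps to
    //   [Some(Literal::Primitive(PrimitiveLiteral::Int(1))),
    //    Some(Literal::Primitive(PrimitiveLiteral::String("us-east".into()))),
    //    None]
    // Integers outside the i32 range become PrimitiveLiteral::Long,
    // and other JSON numbers become PrimitiveLiteral::Double.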
-fn parse_partition_values(json: Option<&serde_json::Value>) -> Vec> { - match json { - Some(serde_json::Value::Array(arr)) => arr - .iter() - .map(|v| match v { - serde_json::Value::Null => None, - serde_json::Value::Bool(b) => { - Some(Literal::Primitive(PrimitiveLiteral::Boolean(*b))) - } - serde_json::Value::Number(n) => { - if let Some(i) = n.as_i64() { - if i >= i32::MIN as i64 && i <= i32::MAX as i64 { - Some(Literal::Primitive(PrimitiveLiteral::Int(i as i32))) - } else { - Some(Literal::Primitive(PrimitiveLiteral::Long(i))) - } - } else { - n.as_f64().map(|f| { - Literal::Primitive(PrimitiveLiteral::Double( - ordered_float::OrderedFloat(f), - )) - }) - } - } - serde_json::Value::String(s) => { - Some(Literal::Primitive(PrimitiveLiteral::String(s.clone()))) - } - _ => None, - }) - .collect(), - _ => Vec::new(), - } -} - -fn parse_bounds_from_binary( - bounds_data: Option<&std::collections::HashMap>>, - schema: &super::Schema, -) -> std::collections::HashMap { - use crate::spec::Type; - let mut bounds = std::collections::HashMap::new(); - if let Some(data) = bounds_data { - for (field_id, binary_data) in data { - if let Some(field) = schema.field_by_id(*field_id) { - let field_type = field.field_type.as_ref(); - let datum = match field_type { - Type::Primitive(prim_type) => { - parse_primitive_bound(binary_data, prim_type).ok() - } - _ => None, - }; - if let Some(d) = datum { - bounds.insert(*field_id, d); - } - } else if let Ok(string_value) = String::from_utf8(binary_data.clone()) { - bounds.insert( - *field_id, - Datum::new( - PrimitiveType::String, - PrimitiveLiteral::String(string_value), - ), - ); - } else { - bounds.insert( - *field_id, - Datum::new( - PrimitiveType::Binary, - PrimitiveLiteral::Binary(binary_data.clone()), - ), - ); - } - } - } - bounds -} - -fn parse_primitive_bound( - bytes: &[u8], - prim_type: &crate::spec::PrimitiveType, -) -> Result { - use num_bigint::BigInt; - use num_traits::ToPrimitive; - - use crate::spec::PrimitiveType; - let literal = match prim_type { - PrimitiveType::Boolean => { - let val = !(bytes.len() == 1 && bytes[0] == 0u8); - PrimitiveLiteral::Boolean(val) - } - PrimitiveType::Int | PrimitiveType::Date => { - let val = i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?); - PrimitiveLiteral::Int(val) - } - PrimitiveType::Long - | PrimitiveType::Time - | PrimitiveType::Timestamp - | PrimitiveType::Timestamptz - | PrimitiveType::TimestampNs - | PrimitiveType::TimestamptzNs => { - let val = if bytes.len() == 4 { - i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?) as i64 - } else { - i64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i64 bytes")?) - }; - PrimitiveLiteral::Long(val) - } - PrimitiveType::Float => { - let val = f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?); - PrimitiveLiteral::Float(ordered_float::OrderedFloat(val)) - } - PrimitiveType::Double => { - let val = if bytes.len() == 4 { - f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?) as f64 - } else { - f64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f64 bytes")?) - }; - PrimitiveLiteral::Double(ordered_float::OrderedFloat(val)) - } - PrimitiveType::String => { - let val = std::str::from_utf8(bytes) - .map_err(|_| "Invalid UTF-8")? 
- .to_string(); - PrimitiveLiteral::String(val) - } - PrimitiveType::Uuid => { - let val = u128::from_be_bytes(bytes.try_into().map_err(|_| "Invalid UUID bytes")?); - PrimitiveLiteral::UInt128(val) - } - PrimitiveType::Fixed(_) | PrimitiveType::Binary => { - PrimitiveLiteral::Binary(Vec::from(bytes)) - } - PrimitiveType::Decimal { .. } => { - let unscaled_value = BigInt::from_signed_bytes_be(bytes); - let val = unscaled_value - .to_i128() - .ok_or_else(|| format!("Can't convert bytes to i128: {:?}", bytes))?; - PrimitiveLiteral::Int128(val) - } - }; - Ok(Datum::new(prim_type.clone(), literal)) -} - -/// Metadata about a manifest file. -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct ManifestMetadata { - /// The schema of the table when the manifest was written. - pub schema: SchemaRef, - /// ID of the schema used to write the manifest - pub schema_id: SchemaId, - /// The partition spec used to write the manifest. - pub partition_spec: PartitionSpec, - /// The format version of the manifest. - pub format_version: FormatVersion, - /// Type of content files tracked by the manifest: data or deletes - pub content: ManifestContentType, -} - -impl ManifestMetadata { - /// Create new manifest metadata. - pub fn new( - schema: SchemaRef, - schema_id: SchemaId, - partition_spec: PartitionSpec, - format_version: FormatVersion, - content: ManifestContentType, - ) -> Self { - Self { - schema, - schema_id, - partition_spec, - format_version, - content, - } - } -} - -/// Format version of Iceberg. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum FormatVersion { - /// Version 1 - V1 = 1, - /// Version 2 - V2 = 2, -} - -impl serde::Serialize for FormatVersion { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - serializer.serialize_i32(*self as i32) - } -} - -impl<'de> serde::Deserialize<'de> for FormatVersion { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let value = i32::deserialize(deserializer)?; - match value { - 1 => Ok(FormatVersion::V1), - 2 => Ok(FormatVersion::V2), - _ => Err(serde::de::Error::custom(format!( - "Invalid format version: {}", - value - ))), - } - } -} - -impl Default for FormatVersion { - fn default() -> Self { - Self::V2 - } -} - -/// Status of a manifest entry. -#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] -#[serde(rename_all = "UPPERCASE")] -pub enum ManifestStatus { - /// The data file was added in this snapshot. - Added, - /// The data file exists in the table. - Existing, - /// The data file was deleted in this snapshot. - Deleted, -} - -/// Content type of a data file. -#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] -#[serde(rename_all = "UPPERCASE")] -pub enum DataContentType { - /// The file contains data. - Data, - /// The file contains position deletes. - PositionDeletes, - /// The file contains equality deletes. - EqualityDeletes, -} - -/// File format of a data file. -#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] -#[serde(rename_all = "UPPERCASE")] -pub enum DataFileFormat { - /// Avro format - Avro, - /// ORC format - Orc, - /// Parquet format - Parquet, - /// Puffin format (for delete files) - Puffin, -} - -/// A manifest entry represents a data file in a manifest. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct ManifestEntry { - /// The status of the data file. - pub status: ManifestStatus, - /// The snapshot ID when the data file was added to the table. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub snapshot_id: Option, - /// The sequence number when the data file was added to the table. - #[serde(skip_serializing_if = "Option::is_none")] - pub sequence_number: Option, - /// The file sequence number indicating when the file was added. - #[serde(skip_serializing_if = "Option::is_none")] - pub file_sequence_number: Option, - /// The data file. - pub data_file: DataFile, -} - -impl ManifestEntry { - /// Create a new manifest entry. - pub fn new( - status: ManifestStatus, - snapshot_id: Option, - sequence_number: Option, - file_sequence_number: Option, - data_file: DataFile, - ) -> Self { - Self { - status, - snapshot_id, - sequence_number, - file_sequence_number, - data_file, - } - } -} - -/// A data file in Iceberg. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct DataFile { - /// Type of content stored by the data file. - pub content: DataContentType, - /// Full URI for the file with FS scheme. - pub file_path: String, - /// File format name. - pub file_format: DataFileFormat, - /// Partition data tuple. - pub partition: Vec>, - /// Number of records in this file. - pub record_count: u64, - /// Total file size in bytes. - pub file_size_in_bytes: u64, - /// Map from column id to the total size on disk of all regions that store the column. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub column_sizes: HashMap, - /// Map from column id to number of values in the column. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub value_counts: HashMap, - /// Map from column id to number of null values in the column. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub null_value_counts: HashMap, - /// Map from column id to number of NaN values in the column. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub nan_value_counts: HashMap, - /// Map from column id to lower bound in the column. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub lower_bounds: HashMap, - /// Map from column id to upper bound in the column. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub upper_bounds: HashMap, - /// Block size in bytes. - #[serde(skip_serializing_if = "Option::is_none")] - pub block_size_in_bytes: Option, - /// Implementation-specific key metadata for encryption. - #[serde(skip_serializing_if = "Option::is_none")] - pub key_metadata: Option>, - /// Split offsets for the data file. - #[serde(skip_serializing_if = "Vec::is_empty")] - pub split_offsets: Vec, - /// Field ids used to determine row equality in equality delete files. - #[serde(skip_serializing_if = "Vec::is_empty")] - pub equality_ids: Vec, - /// ID representing sort order for this file. - #[serde(skip_serializing_if = "Option::is_none")] - pub sort_order_id: Option, - /// The _row_id for the first row in the data file. - #[serde(skip_serializing_if = "Option::is_none")] - pub first_row_id: Option, - /// The partition spec id used when writing this data file. - pub partition_spec_id: i32, - /// Fully qualified location of a data file that all deletes reference. - #[serde(skip_serializing_if = "Option::is_none")] - pub referenced_data_file: Option, - /// The offset in the file where the content starts (for deletion vectors). - #[serde(skip_serializing_if = "Option::is_none")] - pub content_offset: Option, - /// The size of the referenced content in bytes (for deletion vectors). 
- #[serde(skip_serializing_if = "Option::is_none")] - pub content_size_in_bytes: Option, -} - -impl DataFile { - /// Create a new data file builder. - pub fn builder() -> DataFileBuilder { - DataFileBuilder::new() - } - - /// Get the content type of the data file. - pub fn content_type(&self) -> DataContentType { - self.content - } - - /// Get the file path. - pub fn file_path(&self) -> &str { - &self.file_path - } - - /// Get the file format. - pub fn file_format(&self) -> DataFileFormat { - self.file_format - } - - /// Get the partition values. - pub fn partition(&self) -> &[Option] { - &self.partition - } - - /// Get the record count. - pub fn record_count(&self) -> u64 { - self.record_count - } - - /// Get the file size in bytes. - pub fn file_size_in_bytes(&self) -> u64 { - self.file_size_in_bytes - } - - /// Get column sizes. - pub fn column_sizes(&self) -> &HashMap { - &self.column_sizes - } - - /// Get value counts. - pub fn value_counts(&self) -> &HashMap { - &self.value_counts - } - - /// Get null value counts. - pub fn null_value_counts(&self) -> &HashMap { - &self.null_value_counts - } - - /// Get NaN value counts. - pub fn nan_value_counts(&self) -> &HashMap { - &self.nan_value_counts - } - - /// Get lower bounds. - pub fn lower_bounds(&self) -> &HashMap { - &self.lower_bounds - } - - /// Get upper bounds. - pub fn upper_bounds(&self) -> &HashMap { - &self.upper_bounds - } -} - -/// Builder for creating data files. -#[derive(Debug)] -pub struct DataFileBuilder { - content: DataContentType, - file_path: Option, - file_format: DataFileFormat, - partition: Vec>, - record_count: u64, - file_size_in_bytes: u64, - column_sizes: HashMap, - value_counts: HashMap, - null_value_counts: HashMap, - nan_value_counts: HashMap, - lower_bounds: HashMap, - upper_bounds: HashMap, - block_size_in_bytes: Option, - key_metadata: Option>, - split_offsets: Vec, - equality_ids: Vec, - sort_order_id: Option, - first_row_id: Option, - partition_spec_id: i32, - referenced_data_file: Option, - content_offset: Option, - content_size_in_bytes: Option, -} - -impl DataFileBuilder { - /// Create a new data file builder. - pub fn new() -> Self { - Self { - content: DataContentType::Data, - file_path: None, - file_format: DataFileFormat::Parquet, - partition: Vec::new(), - record_count: 0, - file_size_in_bytes: 0, - column_sizes: HashMap::new(), - value_counts: HashMap::new(), - null_value_counts: HashMap::new(), - nan_value_counts: HashMap::new(), - lower_bounds: HashMap::new(), - upper_bounds: HashMap::new(), - block_size_in_bytes: None, - key_metadata: None, - split_offsets: Vec::new(), - equality_ids: Vec::new(), - sort_order_id: None, - first_row_id: None, - partition_spec_id: 0, - referenced_data_file: None, - content_offset: None, - content_size_in_bytes: None, - } - } - - /// Set the content type. - pub fn with_content(mut self, content: DataContentType) -> Self { - self.content = content; - self - } - - /// Set the file path. - pub fn with_file_path(mut self, file_path: impl ToString) -> Self { - self.file_path = Some(file_path.to_string()); - self - } - - /// Set the file format. - pub fn with_file_format(mut self, file_format: DataFileFormat) -> Self { - self.file_format = file_format; - self - } - - /// Set the partition values. - pub fn with_partition(mut self, partition: Vec>) -> Self { - self.partition = partition; - self - } - - /// Set the record count. 
- pub fn with_record_count(mut self, record_count: u64) -> Self { - self.record_count = record_count; - self - } - - /// Set the file size in bytes. - pub fn with_file_size_in_bytes(mut self, file_size_in_bytes: u64) -> Self { - self.file_size_in_bytes = file_size_in_bytes; - self - } - - /// Set the partition spec id. - pub fn with_partition_spec_id(mut self, partition_spec_id: i32) -> Self { - self.partition_spec_id = partition_spec_id; - self - } - - /// Add column size. - pub fn with_column_size(mut self, column_id: i32, size: u64) -> Self { - self.column_sizes.insert(column_id, size); - self - } - - /// Add value count. - pub fn with_value_count(mut self, column_id: i32, count: u64) -> Self { - self.value_counts.insert(column_id, count); - self - } - - /// Add null value count. - pub fn with_null_value_count(mut self, column_id: i32, count: u64) -> Self { - self.null_value_counts.insert(column_id, count); - self - } - - /// Add lower bound. - pub fn with_lower_bound(mut self, column_id: i32, bound: Datum) -> Self { - self.lower_bounds.insert(column_id, bound); - self - } - - /// Add upper bound. - pub fn with_upper_bound(mut self, column_id: i32, bound: Datum) -> Self { - self.upper_bounds.insert(column_id, bound); - self - } - - /// Set the block size in bytes. - pub fn with_block_size_in_bytes(mut self, block_size_in_bytes: i64) -> Self { - self.block_size_in_bytes = Some(block_size_in_bytes); - self - } - - /// Set the first row id. - pub fn with_first_row_id(mut self, first_row_id: i64) -> Self { - self.first_row_id = Some(first_row_id); - self - } - - /// Set the referenced data file path. - pub fn with_referenced_data_file(mut self, path: impl ToString) -> Self { - self.referenced_data_file = Some(path.to_string()); - self - } - - /// Set the content offset and size in bytes. - pub fn with_content_offset_and_size(mut self, offset: i64, size_in_bytes: i64) -> Self { - self.content_offset = Some(offset); - self.content_size_in_bytes = Some(size_in_bytes); - self - } - - /// Build the data file. 
- pub fn build(self) -> Result { - let file_path = self.file_path.ok_or("file_path is required")?; - - Ok(DataFile { - content: self.content, - file_path, - file_format: self.file_format, - partition: self.partition, - record_count: self.record_count, - file_size_in_bytes: self.file_size_in_bytes, - column_sizes: self.column_sizes, - value_counts: self.value_counts, - null_value_counts: self.null_value_counts, - nan_value_counts: self.nan_value_counts, - lower_bounds: self.lower_bounds, - upper_bounds: self.upper_bounds, - block_size_in_bytes: self.block_size_in_bytes, - key_metadata: self.key_metadata, - split_offsets: self.split_offsets, - equality_ids: self.equality_ids, - sort_order_id: self.sort_order_id, - first_row_id: self.first_row_id, - partition_spec_id: self.partition_spec_id, - referenced_data_file: self.referenced_data_file, - content_offset: self.content_offset, - content_size_in_bytes: self.content_size_in_bytes, - }) - } -} - -impl Default for DataFileBuilder { - fn default() -> Self { - Self::new() - } -} diff --git a/crates/sail-iceberg/src/spec/manifest/_serde.rs b/crates/sail-iceberg/src/spec/manifest/_serde.rs new file mode 100644 index 0000000000..5334aebc09 --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest/_serde.rs @@ -0,0 +1,138 @@ +use apache_avro::types::Value as AvroValue; +use serde::{Deserialize, Serialize}; + +use super::{DataContentType, DataFile, DataFileFormat}; +use crate::spec::Schema; + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct ManifestEntryAvro { + #[serde(rename = "status")] + pub status: i32, + #[serde(rename = "snapshot_id")] + pub snapshot_id: Option, + #[serde(rename = "sequence_number")] + pub sequence_number: Option, + #[serde(rename = "file_sequence_number")] + pub file_sequence_number: Option, + #[serde(rename = "data_file")] + pub data_file: DataFileAvro, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct DataFileAvro { + #[serde(rename = "content", default)] + pub content: i32, + #[serde(rename = "file_path")] + pub file_path: String, + #[serde(rename = "file_format")] + pub file_format: String, + #[serde(rename = "partition")] + pub partition: serde_json::Value, + #[serde(rename = "record_count")] + pub record_count: i64, + #[serde(rename = "file_size_in_bytes")] + pub file_size_in_bytes: i64, + #[serde(skip)] + pub column_sizes: Option, + #[serde(skip)] + pub value_counts: Option, + #[serde(skip)] + pub null_value_counts: Option, + #[serde(skip)] + pub nan_value_counts: Option, + #[serde(skip)] + pub lower_bounds: Option, + #[serde(skip)] + pub upper_bounds: Option, + #[serde(rename = "key_metadata")] + pub key_metadata: Option>, + #[serde(rename = "split_offsets")] + pub split_offsets: Option>, + #[serde(rename = "equality_ids")] + pub equality_ids: Option>, + #[serde(rename = "sort_order_id")] + pub sort_order_id: Option, +} + +impl DataFileAvro { + pub fn into_data_file( + self, + schema: &Schema, + _partition_type_len: i32, + partition_spec_id: i32, + ) -> DataFile { + let content = match self.content { + 0 => DataContentType::Data, + 1 => DataContentType::PositionDeletes, + 2 => DataContentType::EqualityDeletes, + _ => DataContentType::Data, + }; + + let file_format = match self.file_format.to_uppercase().as_str() { + "PARQUET" => DataFileFormat::Parquet, + "AVRO" => DataFileFormat::Avro, + "ORC" => DataFileFormat::Orc, + _ => DataFileFormat::Parquet, + }; + + let partition = super::super::manifest::parse_partition_values(Some(&self.partition)); + + let column_sizes = 
super::super::manifest::parse_i64_map_from_avro(&self.column_sizes) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + let value_counts = super::super::manifest::parse_i64_map_from_avro(&self.value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + let null_value_counts = + super::super::manifest::parse_i64_map_from_avro(&self.null_value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + let nan_value_counts = + super::super::manifest::parse_i64_map_from_avro(&self.nan_value_counts) + .into_iter() + .map(|(k, v)| (k, v as u64)) + .collect(); + + let lower_bounds_raw = + super::super::manifest::parse_bytes_map_from_avro(&self.lower_bounds); + let upper_bounds_raw = + super::super::manifest::parse_bytes_map_from_avro(&self.upper_bounds); + let lower_bounds = + super::super::manifest::parse_bounds_from_binary(lower_bounds_raw.as_ref(), schema); + let upper_bounds = + super::super::manifest::parse_bounds_from_binary(upper_bounds_raw.as_ref(), schema); + + DataFile { + content, + file_path: self.file_path, + file_format, + partition, + record_count: self.record_count as u64, + file_size_in_bytes: self.file_size_in_bytes as u64, + column_sizes, + value_counts, + null_value_counts, + nan_value_counts, + lower_bounds, + upper_bounds, + block_size_in_bytes: None, + key_metadata: self.key_metadata, + split_offsets: self.split_offsets.unwrap_or_default(), + equality_ids: self + .equality_ids + .unwrap_or_default() + .into_iter() + .map(|v| v as i32) + .collect(), + sort_order_id: self.sort_order_id, + first_row_id: None, + partition_spec_id, + referenced_data_file: None, + content_offset: None, + content_size_in_bytes: None, + } + } +} diff --git a/crates/sail-iceberg/src/spec/manifest/data_file.rs b/crates/sail-iceberg/src/spec/manifest/data_file.rs new file mode 100644 index 0000000000..0de5adb645 --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest/data_file.rs @@ -0,0 +1,105 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::spec::{Datum, Literal}; + +/// Content type of a data file. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum DataContentType { + Data, + PositionDeletes, + EqualityDeletes, +} + +/// File format of a data file. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum DataFileFormat { + Avro, + Orc, + Parquet, + Puffin, +} + +/// A data file in Iceberg. 
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct DataFile { + pub content: DataContentType, + pub file_path: String, + pub file_format: DataFileFormat, + pub partition: Vec>, + pub record_count: u64, + pub file_size_in_bytes: u64, + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub column_sizes: HashMap, + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub value_counts: HashMap, + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub null_value_counts: HashMap, + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub nan_value_counts: HashMap, + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub lower_bounds: HashMap, + #[serde(skip_serializing_if = "HashMap::is_empty")] + pub upper_bounds: HashMap, + #[serde(skip_serializing_if = "Option::is_none")] + pub block_size_in_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub key_metadata: Option>, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub split_offsets: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub equality_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub sort_order_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub first_row_id: Option, + pub partition_spec_id: i32, + #[serde(skip_serializing_if = "Option::is_none")] + pub referenced_data_file: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub content_offset: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub content_size_in_bytes: Option, +} + +impl DataFile { + pub fn content_type(&self) -> DataContentType { + self.content + } + pub fn file_path(&self) -> &str { + &self.file_path + } + pub fn file_format(&self) -> DataFileFormat { + self.file_format + } + pub fn partition(&self) -> &[Option] { + &self.partition + } + pub fn record_count(&self) -> u64 { + self.record_count + } + pub fn file_size_in_bytes(&self) -> u64 { + self.file_size_in_bytes + } + pub fn column_sizes(&self) -> &HashMap { + &self.column_sizes + } + pub fn value_counts(&self) -> &HashMap { + &self.value_counts + } + pub fn null_value_counts(&self) -> &HashMap { + &self.null_value_counts + } + pub fn nan_value_counts(&self) -> &HashMap { + &self.nan_value_counts + } + pub fn lower_bounds(&self) -> &HashMap { + &self.lower_bounds + } + pub fn upper_bounds(&self) -> &HashMap { + &self.upper_bounds + } +} diff --git a/crates/sail-iceberg/src/spec/manifest/entry.rs b/crates/sail-iceberg/src/spec/manifest/entry.rs new file mode 100644 index 0000000000..47b6c484ae --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest/entry.rs @@ -0,0 +1,43 @@ +use serde::{Deserialize, Serialize}; + +use super::DataFile; + +/// Status of a manifest entry. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum ManifestStatus { + Added, + Existing, + Deleted, +} + +/// A manifest entry represents a data file in a manifest. 
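An illustrative helper (not part of the patch) showing how the statistics accessors above can be combined, for example to detect an all-null column during pruning:

fn column_all_null(file: &DataFile, field_id: i32) -> bool {
    // A column is known to be all null when its value count equals its null count.
    match (
        file.value_counts().get(&field_id),
        file.null_value_counts().get(&field_id),
    ) {
        (Some(values), Some(nulls)) => *values > 0 && values == nulls,
        _ => false,
    }
}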
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct ManifestEntry { + pub status: ManifestStatus, + #[serde(skip_serializing_if = "Option::is_none")] + pub snapshot_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub sequence_number: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub file_sequence_number: Option, + pub data_file: DataFile, +} + +impl ManifestEntry { + pub fn new( + status: ManifestStatus, + snapshot_id: Option, + sequence_number: Option, + file_sequence_number: Option, + data_file: DataFile, + ) -> Self { + Self { + status, + snapshot_id, + sequence_number, + file_sequence_number, + data_file, + } + } +} diff --git a/crates/sail-iceberg/src/spec/manifest/metadata.rs b/crates/sail-iceberg/src/spec/manifest/metadata.rs new file mode 100644 index 0000000000..2333355fb2 --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest/metadata.rs @@ -0,0 +1,94 @@ +use serde::{Deserialize, Serialize}; + +use crate::spec::{ + FormatVersion, ManifestContentType, PartitionSpec, Schema as IcebergSchema, SchemaId, SchemaRef, +}; + +/// Metadata about a manifest file. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct ManifestMetadata { + pub schema: SchemaRef, + pub schema_id: SchemaId, + pub partition_spec: PartitionSpec, + pub format_version: FormatVersion, + pub content: ManifestContentType, +} + +impl ManifestMetadata { + pub fn new( + schema: SchemaRef, + schema_id: SchemaId, + partition_spec: PartitionSpec, + format_version: FormatVersion, + content: ManifestContentType, + ) -> Self { + Self { + schema, + schema_id, + partition_spec, + format_version, + content, + } + } + + pub(crate) fn parse_from_avro_meta( + meta: &std::collections::HashMap>, + ) -> Result { + // schema + let schema_bs = meta + .get("schema") + .ok_or_else(|| "schema is required in manifest metadata but not found".to_string())?; + let schema: IcebergSchema = serde_json::from_slice(schema_bs) + .map_err(|e| format!("Fail to parse schema in manifest metadata: {e}"))?; + let schema_ref = std::sync::Arc::new(schema); + + // schema-id (optional) + let schema_id: i32 = meta + .get("schema-id") + .and_then(|bs| String::from_utf8(bs.clone()).ok()) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + + // partition-spec and id + let part_fields_bs = meta.get("partition-spec").ok_or_else(|| { + "partition-spec is required in manifest metadata but not found".to_string() + })?; + let part_fields: Vec = + serde_json::from_slice(part_fields_bs) + .map_err(|e| format!("Fail to parse partition spec in manifest metadata: {e}"))?; + let spec_id: i32 = meta + .get("partition-spec-id") + .and_then(|bs| String::from_utf8(bs.clone()).ok()) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + let mut builder = crate::spec::partition::PartitionSpec::builder().with_spec_id(spec_id); + for f in part_fields { + builder = builder.add_field_with_id(f.source_id, f.field_id, f.name, f.transform); + } + let partition_spec = builder.build(); + + // format-version + let format_version = meta + .get("format-version") + .and_then(|bs| serde_json::from_slice::(bs).ok()) + .unwrap_or(crate::spec::FormatVersion::V1); + + // content + let content = meta + .get("content") + .and_then(|bs| String::from_utf8(bs.clone()).ok()) + .map(|s| match s.to_ascii_lowercase().as_str() { + "deletes" => crate::spec::manifest_list::ManifestContentType::Deletes, + _ => crate::spec::manifest_list::ManifestContentType::Data, + }) + 
.unwrap_or(crate::spec::manifest_list::ManifestContentType::Data); + + Ok(ManifestMetadata::new( + schema_ref, + schema_id, + partition_spec, + format_version, + content, + )) + } +} diff --git a/crates/sail-iceberg/src/spec/manifest/mod.rs b/crates/sail-iceberg/src/spec/manifest/mod.rs new file mode 100644 index 0000000000..ef61ba5c5d --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest/mod.rs @@ -0,0 +1,331 @@ +use std::sync::Arc; + +use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; + +use super::Schema; + +mod _serde; +mod data_file; +mod entry; +mod metadata; + +pub use data_file::*; +pub use entry::*; +pub use metadata::*; + +/// Reference to [`ManifestEntry`]. +pub type ManifestEntryRef = Arc; + +/// A manifest contains metadata and a list of entries. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Manifest { + /// Metadata about the manifest. + pub metadata: ManifestMetadata, + /// Entries in the manifest. + pub entries: Vec, +} + +impl Manifest { + /// Create a new manifest. + pub fn new(metadata: ManifestMetadata, entries: Vec) -> Self { + Self { + metadata, + entries: entries.into_iter().map(Arc::new).collect(), + } + } + + /// Get the entries in the manifest. + pub fn entries(&self) -> &[ManifestEntryRef] { + &self.entries + } + + /// Get the metadata of the manifest. + pub fn metadata(&self) -> &ManifestMetadata { + &self.metadata + } + + /// Consume this Manifest, returning its constituent parts + pub fn into_parts(self) -> (Vec, ManifestMetadata) { + let Self { entries, metadata } = self; + (entries, metadata) + } + + /// Parse manifest metadata and entries from bytes of avro file. + pub(crate) fn try_from_avro_bytes( + bs: &[u8], + ) -> Result<(ManifestMetadata, Vec), String> { + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + + // Parse manifest metadata from avro user metadata + let meta = reader.user_metadata(); + let metadata = ManifestMetadata::parse_from_avro_meta(meta)?; + + // Determine partition type to guide value decoding when needed + let partition_type = metadata + .partition_spec + .partition_type(&metadata.schema) + .map_err(|e| format!("Partition type error: {e}"))?; + + // For entries, reuse the embedded schema in the Avro file and deserialize per record + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + let mut entries = Vec::new(); + for value in reader { + let value = value.map_err(|e| format!("Avro read value error: {e}"))?; + let entry_avro: _serde::ManifestEntryAvro = + avro_from_value(&value).map_err(|e| format!("Avro decode entry error: {e}"))?; + let data_file = entry_avro.data_file.into_data_file( + &metadata.schema, + partition_type.fields().len() as i32, + metadata.partition_spec.spec_id(), + ); + let status = match entry_avro.status { + 1 => ManifestStatus::Added, + 2 => ManifestStatus::Deleted, + _ => ManifestStatus::Existing, + }; + let entry = ManifestEntry::new( + status, + entry_avro.snapshot_id, + entry_avro.sequence_number, + entry_avro.file_sequence_number, + data_file, + ); + entries.push(entry); + } + + Ok((metadata, entries)) + } + + /// Parse a manifest from bytes of avro file. 
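The parse_avro entry point defined next can be exercised as in the sketch below; the manifest path is hypothetical, and the Result<_, String> signature matches the function shown after this note:

use std::fs;

fn list_data_files(manifest_path: &str) -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical path to a manifest Avro file under the table's metadata directory.
    let bytes = fs::read(manifest_path)?;
    let manifest = Manifest::parse_avro(&bytes)?; // Result<Manifest, String>
    for entry in manifest.entries() {
        println!(
            "{:?} {} ({} rows)",
            entry.status,
            entry.data_file.file_path(),
            entry.data_file.record_count()
        );
    }
    Ok(())
}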
+ pub fn parse_avro(bs: &[u8]) -> Result { + let (metadata, entries) = Self::try_from_avro_bytes(bs)?; + Ok(Manifest::new(metadata, entries)) + } +} + +// Helper functions used by Avro serde to parse partition values and bounds +use crate::spec::datatypes::Type; +use crate::spec::values::Literal; +use crate::spec::{Datum, PrimitiveLiteral, PrimitiveType}; + +pub(super) fn parse_partition_values(json: Option<&serde_json::Value>) -> Vec> { + match json { + Some(serde_json::Value::Array(arr)) => arr + .iter() + .map(|v| match v { + serde_json::Value::Null => None, + serde_json::Value::Bool(b) => { + Some(Literal::Primitive(PrimitiveLiteral::Boolean(*b))) + } + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + if i >= i32::MIN as i64 && i <= i32::MAX as i64 { + Some(Literal::Primitive(PrimitiveLiteral::Int(i as i32))) + } else { + Some(Literal::Primitive(PrimitiveLiteral::Long(i))) + } + } else { + n.as_f64().map(|f| { + Literal::Primitive(PrimitiveLiteral::Double( + ordered_float::OrderedFloat(f), + )) + }) + } + } + serde_json::Value::String(s) => { + Some(Literal::Primitive(PrimitiveLiteral::String(s.clone()))) + } + _ => None, + }) + .collect(), + _ => Vec::new(), + } +} + +pub(super) fn parse_i64_map_from_avro( + values: &Option, +) -> std::collections::HashMap { + use apache_avro::types::Value; + let mut map = std::collections::HashMap::new(); + if let Some(Value::Map(obj)) = values { + for (k, v) in obj { + if let Value::Long(i) = v { + map.insert(k.parse::().unwrap_or(0), *i); + } + } + return map; + } + if let Some(Value::Array(vec)) = values { + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + for (name, val) in fields { + match name.as_str() { + "key" => { + if let Value::Int(k) = val { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Long(vl) = val { + value_opt = Some(*vl); + } + } + _ => {} + } + } + if let (Some(k), Some(v)) = (key_opt, value_opt) { + map.insert(k, v); + } + } + } + } + map +} + +pub(super) fn parse_bytes_map_from_avro( + values: &Option, +) -> Option>> { + use apache_avro::types::Value; + if let Some(Value::Map(obj)) = values { + let mut map = std::collections::HashMap::new(); + for (k, v) in obj { + if let Value::Bytes(b) = v { + map.insert(k.parse::().unwrap_or(0), b.clone()); + } + } + return Some(map); + } + if let Some(Value::Array(vec)) = values { + let mut map = std::collections::HashMap::new(); + for item in vec { + if let Value::Record(fields) = item { + let mut key_opt = None; + let mut value_opt = None; + for (name, val) in fields { + match name.as_str() { + "key" => { + if let Value::Int(k) = val { + key_opt = Some(*k); + } + } + "value" => { + if let Value::Bytes(b) = val { + value_opt = Some(b.clone()); + } + } + _ => {} + } + } + if let (Some(k), Some(v)) = (key_opt, value_opt) { + map.insert(k, v); + } + } + } + return Some(map); + } + None +} + +pub(super) fn parse_bounds_from_binary( + bounds_data: Option<&std::collections::HashMap>>, + schema: &Schema, +) -> std::collections::HashMap { + let mut bounds = std::collections::HashMap::new(); + if let Some(data) = bounds_data { + for (field_id, binary_data) in data { + if let Some(field) = schema.field_by_id(*field_id) { + let field_type = field.field_type.as_ref(); + let datum = match field_type { + Type::Primitive(prim_type) => { + parse_primitive_bound(binary_data, prim_type).ok() + } + _ => None, + }; + if let Some(d) = datum { + bounds.insert(*field_id, d); + } + } else if let Ok(string_value) = 
String::from_utf8(binary_data.clone()) { + bounds.insert( + *field_id, + Datum::new( + PrimitiveType::String, + PrimitiveLiteral::String(string_value), + ), + ); + } else { + bounds.insert( + *field_id, + Datum::new( + PrimitiveType::Binary, + PrimitiveLiteral::Binary(binary_data.clone()), + ), + ); + } + } + } + bounds +} + +fn parse_primitive_bound(bytes: &[u8], prim_type: &PrimitiveType) -> Result { + use num_bigint::BigInt; + use num_traits::ToPrimitive; + + let literal = match prim_type { + PrimitiveType::Boolean => { + let val = !(bytes.len() == 1 && bytes[0] == 0u8); + PrimitiveLiteral::Boolean(val) + } + PrimitiveType::Int | PrimitiveType::Date => { + let val = i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?); + PrimitiveLiteral::Int(val) + } + PrimitiveType::Long + | PrimitiveType::Time + | PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs => { + let val = if bytes.len() == 4 { + i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?) as i64 + } else { + i64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i64 bytes")?) + }; + PrimitiveLiteral::Long(val) + } + PrimitiveType::Float => { + let val = f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?); + PrimitiveLiteral::Float(ordered_float::OrderedFloat(val)) + } + PrimitiveType::Double => { + let val = if bytes.len() == 4 { + f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?) as f64 + } else { + f64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f64 bytes")?) + }; + PrimitiveLiteral::Double(ordered_float::OrderedFloat(val)) + } + PrimitiveType::String => { + let val = std::str::from_utf8(bytes) + .map_err(|_| "Invalid UTF-8")? + .to_string(); + PrimitiveLiteral::String(val) + } + PrimitiveType::Uuid => { + let val = u128::from_be_bytes(bytes.try_into().map_err(|_| "Invalid UUID bytes")?); + PrimitiveLiteral::UInt128(val) + } + PrimitiveType::Fixed(_) | PrimitiveType::Binary => { + PrimitiveLiteral::Binary(Vec::from(bytes)) + } + PrimitiveType::Decimal { .. } => { + let unscaled_value = BigInt::from_signed_bytes_be(bytes); + let val = unscaled_value + .to_i128() + .ok_or_else(|| format!("Can't convert bytes to i128: {:?}", bytes))?; + PrimitiveLiteral::Int128(val) + } + }; + Ok(Datum::new(prim_type.clone(), literal)) +} diff --git a/crates/sail-iceberg/src/spec/manifest/writer.rs b/crates/sail-iceberg/src/spec/manifest/writer.rs new file mode 100644 index 0000000000..22ebe298b0 --- /dev/null +++ b/crates/sail-iceberg/src/spec/manifest/writer.rs @@ -0,0 +1,5 @@ +// Awareness stub for non-read path +// TODO: Implement writer support if/when write path is added. +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct ManifestWriter; diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs index ef00f13b50..c4ff07a837 100644 --- a/crates/sail-iceberg/src/spec/manifest_list.rs +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -2,6 +2,7 @@ use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; use serde::{Deserialize, Serialize}; use super::values::{Literal, PrimitiveLiteral}; +use crate::spec::FormatVersion; pub const UNASSIGNED_SEQUENCE_NUMBER: i64 = -1; @@ -41,61 +42,39 @@ impl ManifestList { } /// Parse manifest list from bytes with a specified version. 
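As a worked example of the Decimal branch in parse_primitive_bound above: decimal bounds are stored as big-endian two's-complement unscaled integers, so the round trip through num-bigint looks like this (illustrative sketch, not part of the patch):

use num_bigint::BigInt;
use num_traits::ToPrimitive;

// Encode an unscaled decimal value the way a writer would, then decode it back.
let encoded = BigInt::from(12345_i64).to_signed_bytes_be();
let unscaled = BigInt::from_signed_bytes_be(&encoded)
    .to_i128()
    .expect("fits in i128");
assert_eq!(unscaled, 12345);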
- pub fn parse_with_version( - bs: &[u8], - _version: super::FormatVersion, - ) -> Result { - // Decode per-record to avoid array-level serde issues; field aliases/defaults cover V1/V2 - let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; - let mut manifest_files = Vec::new(); - for value in reader { - let value = value.map_err(|e| format!("Avro read value error: {e}"))?; - let mf_avro: ManifestFileAvro = - avro_from_value(&value).map_err(|e| format!("Avro decode error: {e}"))?; - manifest_files.push(mf_avro.into()); + pub fn parse_with_version(bs: &[u8], version: FormatVersion) -> Result { + match version { + FormatVersion::V1 => { + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + let mut entries = Vec::new(); + for value in reader { + let value = value.map_err(|e| format!("Avro read value error: {e}"))?; + let v1: _serde::ManifestFileV1 = + avro_from_value(&value).map_err(|e| format!("Avro decode error: {e}"))?; + entries.push(ManifestFile::from(v1)); + } + Ok(ManifestList::new(entries)) + } + FormatVersion::V2 => { + let reader = AvroReader::new(bs).map_err(|e| format!("Avro read error: {e}"))?; + let mut entries = Vec::new(); + for value in reader { + let value = value.map_err(|e| format!("Avro read value error: {e}"))?; + let v2: _serde::ManifestFileV2 = + avro_from_value(&value).map_err(|e| format!("Avro decode error: {e}"))?; + entries.push(ManifestFile::from(v2)); + } + Ok(ManifestList::new(entries)) + } } - Ok(ManifestList::new(manifest_files)) } } -#[derive(Debug, Serialize, Deserialize)] -#[serde(rename_all = "kebab-case")] -struct ManifestFileAvro { - #[serde(rename = "manifest_path")] - manifest_path: String, - #[serde(rename = "manifest_length")] - manifest_length: i64, - #[serde(rename = "partition_spec_id")] - partition_spec_id: i32, - #[serde(rename = "content")] - content: i32, - #[serde(rename = "sequence_number")] - sequence_number: i64, - #[serde(rename = "min_sequence_number")] - min_sequence_number: i64, - #[serde(rename = "added_snapshot_id")] - added_snapshot_id: i64, - #[serde(alias = "added_data_files_count", rename = "added_files_count")] - added_files_count: i32, - #[serde(alias = "existing_data_files_count", rename = "existing_files_count")] - existing_files_count: i32, - #[serde(alias = "deleted_data_files_count", rename = "deleted_files_count")] - deleted_files_count: i32, - #[serde(rename = "added_rows_count")] - added_rows_count: i64, - #[serde(rename = "existing_rows_count")] - existing_rows_count: i64, - #[serde(rename = "deleted_rows_count")] - deleted_rows_count: i64, - #[serde(rename = "partitions")] - partitions: Option>, - #[serde(rename = "key_metadata")] - key_metadata: Option>, -} +// removed duplicate early _serde block; see single _serde module below -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "kebab-case")] -struct FieldSummaryAvro { +pub struct FieldSummaryAvro { #[serde(rename = "contains_null")] contains_null: bool, #[serde(rename = "contains_nan")] @@ -106,42 +85,43 @@ struct FieldSummaryAvro { upper_bound: Option>, } -impl From for ManifestFile { - fn from(avro: ManifestFileAvro) -> Self { +impl From for FieldSummary { + fn from(summary: FieldSummaryAvro) -> Self { + let lower_bound = summary + .lower_bound + .and_then(|bytes| String::from_utf8(bytes).ok()) + .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); + + let upper_bound = summary + .upper_bound + .and_then(|bytes| String::from_utf8(bytes).ok()) 
+ .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); + + let mut field_summary = FieldSummary::new(summary.contains_null); + if let Some(contains_nan) = summary.contains_nan { + field_summary = field_summary.with_contains_nan(contains_nan); + } + if let Some(lower) = lower_bound { + field_summary = field_summary.with_lower_bound(lower); + } + if let Some(upper) = upper_bound { + field_summary = field_summary.with_upper_bound(upper); + } + field_summary + } +} + +impl From<_serde::ManifestFileV2> for ManifestFile { + fn from(avro: _serde::ManifestFileV2) -> Self { let content = match avro.content { 0 => ManifestContentType::Data, 1 => ManifestContentType::Deletes, _ => ManifestContentType::Data, }; - let partitions = avro.partitions.map(|summaries| { - summaries - .into_iter() - .map(|summary| { - let lower_bound = summary - .lower_bound - .and_then(|bytes| String::from_utf8(bytes).ok()) - .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); - - let upper_bound = summary - .upper_bound - .and_then(|bytes| String::from_utf8(bytes).ok()) - .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); - - let mut field_summary = FieldSummary::new(summary.contains_null); - if let Some(contains_nan) = summary.contains_nan { - field_summary = field_summary.with_contains_nan(contains_nan); - } - if let Some(lower) = lower_bound { - field_summary = field_summary.with_lower_bound(lower); - } - if let Some(upper) = upper_bound { - field_summary = field_summary.with_upper_bound(upper); - } - field_summary - }) - .collect() - }); + let partitions = avro + .partitions + .map(|summaries| summaries.into_iter().map(FieldSummary::from).collect()); ManifestFile { manifest_path: avro.manifest_path, @@ -446,123 +426,90 @@ pub(super) mod _serde { use super::*; #[derive(Debug, Serialize, Deserialize)] - #[serde(transparent)] - pub(crate) struct ManifestListV1 { - entries: Vec, - } - - #[derive(Debug, Serialize, Deserialize)] - pub(crate) struct ManifestListV2 { - entries: Vec, - } - - #[derive(Debug, Serialize, Deserialize)] - pub(super) struct ManifestFileV1 { + #[serde(rename_all = "kebab-case")] + pub struct ManifestFileV1 { + #[serde(rename = "manifest_path")] pub manifest_path: String, + #[serde(rename = "manifest_length")] pub manifest_length: i64, + #[serde(rename = "partition_spec_id")] pub partition_spec_id: i32, + #[serde(rename = "added_snapshot_id")] pub added_snapshot_id: i64, + #[serde(rename = "added_data_files_count")] pub added_data_files_count: Option, + #[serde(rename = "existing_data_files_count")] pub existing_data_files_count: Option, + #[serde(rename = "deleted_data_files_count")] pub deleted_data_files_count: Option, + #[serde(rename = "added_rows_count")] pub added_rows_count: Option, + #[serde(rename = "existing_rows_count")] pub existing_rows_count: Option, + #[serde(rename = "deleted_rows_count")] pub deleted_rows_count: Option, - pub partitions: Option>, + #[serde(rename = "partitions")] + pub partitions: Option>, // V1 uses same summary encoding + #[serde(rename = "key_metadata")] pub key_metadata: Option>, } #[derive(Debug, Serialize, Deserialize)] - pub(super) struct ManifestFileV2 { + #[serde(rename_all = "kebab-case")] + pub struct ManifestFileV2 { + #[serde(rename = "manifest_path")] pub manifest_path: String, + #[serde(rename = "manifest_length")] pub manifest_length: i64, + #[serde(rename = "partition_spec_id")] pub partition_spec_id: i32, - #[serde(default = "v2_default_content_for_v1")] + #[serde(rename = "content")] pub content: i32, - #[serde(default = 
"v2_default_sequence_number_for_v1")] + #[serde(rename = "sequence_number")] pub sequence_number: i64, - #[serde(default = "v2_default_min_sequence_number_for_v1")] + #[serde(rename = "min_sequence_number")] pub min_sequence_number: i64, - #[serde(alias = "added_data_files_count", alias = "added_files_count")] + #[serde(rename = "added_snapshot_id")] + pub added_snapshot_id: i64, + #[serde(rename = "added_files_count")] pub added_files_count: i32, - #[serde(alias = "existing_data_files_count", alias = "existing_files_count")] + #[serde(rename = "existing_files_count")] pub existing_files_count: i32, - #[serde(alias = "deleted_data_files_count", alias = "deleted_files_count")] + #[serde(rename = "deleted_files_count")] pub deleted_files_count: i32, - pub added_snapshot_id: i64, + #[serde(rename = "added_rows_count")] pub added_rows_count: i64, + #[serde(rename = "existing_rows_count")] pub existing_rows_count: i64, + #[serde(rename = "deleted_rows_count")] pub deleted_rows_count: i64, - pub partitions: Option>, + #[serde(rename = "partitions")] + pub partitions: Option>, + #[serde(rename = "key_metadata")] pub key_metadata: Option>, } +} - const fn v2_default_content_for_v1() -> i32 { - super::ManifestContentType::Data as i32 - } - const fn v2_default_sequence_number_for_v1() -> i64 { - 0 - } - const fn v2_default_min_sequence_number_for_v1() -> i64 { - 0 - } - - impl TryFrom for super::ManifestList { - type Error = String; - fn try_from(v1: ManifestListV1) -> Result { - let entries = v1 - .entries - .into_iter() - .map(|e| ManifestFile { - manifest_path: e.manifest_path, - manifest_length: e.manifest_length, - partition_spec_id: e.partition_spec_id, - content: ManifestContentType::Data, - sequence_number: 0, - min_sequence_number: 0, - added_snapshot_id: e.added_snapshot_id, - added_files_count: e.added_data_files_count, - existing_files_count: e.existing_data_files_count, - deleted_files_count: e.deleted_data_files_count, - added_rows_count: e.added_rows_count, - existing_rows_count: e.existing_rows_count, - deleted_rows_count: e.deleted_rows_count, - partitions: e.partitions, - key_metadata: e.key_metadata, - }) - .collect(); - Ok(super::ManifestList::new(entries)) - } - } - - impl TryFrom for super::ManifestList { - type Error = String; - fn try_from(v2: ManifestListV2) -> Result { - let entries = v2 - .entries - .into_iter() - .map(|e| ManifestFile { - manifest_path: e.manifest_path, - manifest_length: e.manifest_length, - partition_spec_id: e.partition_spec_id, - content: match e.content { - 1 => ManifestContentType::Deletes, - _ => ManifestContentType::Data, - }, - sequence_number: e.sequence_number, - min_sequence_number: e.min_sequence_number, - added_snapshot_id: e.added_snapshot_id, - added_files_count: Some(e.added_files_count), - existing_files_count: Some(e.existing_files_count), - deleted_files_count: Some(e.deleted_files_count), - added_rows_count: Some(e.added_rows_count), - existing_rows_count: Some(e.existing_rows_count), - deleted_rows_count: Some(e.deleted_rows_count), - partitions: e.partitions, - key_metadata: e.key_metadata, - }) - .collect(); - Ok(super::ManifestList::new(entries)) +impl From<_serde::ManifestFileV1> for ManifestFile { + fn from(v1: _serde::ManifestFileV1) -> Self { + ManifestFile { + manifest_path: v1.manifest_path, + manifest_length: v1.manifest_length, + partition_spec_id: v1.partition_spec_id, + content: ManifestContentType::Data, + sequence_number: 0, + min_sequence_number: 0, + added_snapshot_id: v1.added_snapshot_id, + added_files_count: 
v1.added_data_files_count, + existing_files_count: v1.existing_data_files_count, + deleted_files_count: v1.deleted_data_files_count, + added_rows_count: v1.added_rows_count, + existing_rows_count: v1.existing_rows_count, + deleted_rows_count: v1.deleted_rows_count, + partitions: v1 + .partitions + .map(|v| v.into_iter().map(FieldSummary::from).collect()), + key_metadata: v1.key_metadata, } } } diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index 3fe8620414..c9193d44bd 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -1,25 +1,39 @@ pub mod datatypes; +pub mod encrypted_key; +pub mod format; pub mod manifest; pub mod manifest_list; pub mod name_mapping; pub mod partition; pub mod schema; pub mod snapshot; +pub mod snapshot_summary; pub mod sort; -pub mod statistics; +pub mod statistic_file; pub mod table_metadata; +pub mod table_metadata_builder; pub mod transform; pub mod values; +pub mod view_metadata; +pub mod view_metadata_builder; +pub mod view_version; pub use datatypes::*; +pub use encrypted_key::*; +pub use format::*; pub use manifest::*; pub use manifest_list::*; pub use name_mapping::*; pub use partition::*; pub use schema::*; pub use snapshot::*; +pub use snapshot_summary::*; pub use sort::*; -pub use statistics::*; +pub use statistic_file::*; pub use table_metadata::*; +pub use table_metadata_builder::*; pub use transform::*; pub use values::*; +pub use view_metadata::*; +pub use view_metadata_builder::*; +pub use view_version::*; diff --git a/crates/sail-iceberg/src/spec/name_mapping.rs b/crates/sail-iceberg/src/spec/name_mapping/mod.rs similarity index 99% rename from crates/sail-iceberg/src/spec/name_mapping.rs rename to crates/sail-iceberg/src/spec/name_mapping/mod.rs index 858b73f9d8..1656196d10 100644 --- a/crates/sail-iceberg/src/spec/name_mapping.rs +++ b/crates/sail-iceberg/src/spec/name_mapping/mod.rs @@ -49,12 +49,10 @@ impl MappedField { pub fn field_id(&self) -> Option { self.field_id } - /// All names for this field pub fn names(&self) -> &[String] { &self.names } - /// Child mapped fields pub fn fields(&self) -> &[Arc] { &self.fields diff --git a/crates/sail-iceberg/src/spec/partition.rs b/crates/sail-iceberg/src/spec/partition.rs index fcf0c8a894..07ffcf022d 100644 --- a/crates/sail-iceberg/src/spec/partition.rs +++ b/crates/sail-iceberg/src/spec/partition.rs @@ -122,6 +122,19 @@ impl PartitionSpec { self.fields.iter().map(|f| f.field_id).max() } + /// Check if the partition spec has sequential field ids starting from 1000. + /// Required for spec version 1 in the reference implementation. + pub fn has_sequential_ids(&self) -> bool { + let mut expected = 1000; + for field in &self.fields { + if field.field_id != expected { + return false; + } + expected += 1; + } + true + } + /// Check if this partition spec is compatible with another partition spec. /// /// Returns true if the partition spec is equal to the other spec with partition field ids ignored and diff --git a/crates/sail-iceberg/src/spec/snapshot_summary.rs b/crates/sail-iceberg/src/spec/snapshot_summary.rs new file mode 100644 index 0000000000..ac8fc8de4e --- /dev/null +++ b/crates/sail-iceberg/src/spec/snapshot_summary.rs @@ -0,0 +1,4 @@ +// Awareness stub for read-path. Extend later if needed. 
+#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct SnapshotSummary; diff --git a/crates/sail-iceberg/src/spec/statistic_file.rs b/crates/sail-iceberg/src/spec/statistic_file.rs new file mode 100644 index 0000000000..d184e76079 --- /dev/null +++ b/crates/sail-iceberg/src/spec/statistic_file.rs @@ -0,0 +1,46 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct StatisticsFile { + /// The snapshot id of the statistics file. + pub snapshot_id: i64, + /// Path of the statistics file + pub statistics_path: String, + /// File size in bytes + pub file_size_in_bytes: i64, + /// File footer size in bytes + pub file_footer_size_in_bytes: i64, + /// Base64-encoded implementation-specific key metadata for encryption. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub key_metadata: Option, + /// Blob metadata + pub blob_metadata: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct BlobMetadata { + /// Type of the blob. + pub r#type: String, + /// Snapshot id of the blob. + pub snapshot_id: i64, + /// Sequence number of the blob. + pub sequence_number: i64, + /// Fields of the blob. + pub fields: Vec, + /// Properties of the blob. + #[serde(default, skip_serializing_if = "std::collections::HashMap::is_empty")] + pub properties: std::collections::HashMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct PartitionStatisticsFile { + /// The snapshot id of the statistics file. + pub snapshot_id: i64, + /// Path of the statistics file + pub statistics_path: String, + /// File size in bytes + pub file_size_in_bytes: i64, +} diff --git a/crates/sail-iceberg/src/spec/table_metadata_builder.rs b/crates/sail-iceberg/src/spec/table_metadata_builder.rs new file mode 100644 index 0000000000..1f70a8693a --- /dev/null +++ b/crates/sail-iceberg/src/spec/table_metadata_builder.rs @@ -0,0 +1,3 @@ +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct TableMetadataBuilder; diff --git a/crates/sail-iceberg/src/spec/transform.rs b/crates/sail-iceberg/src/spec/transform.rs index 0653f6a0ef..50571aa4c4 100644 --- a/crates/sail-iceberg/src/spec/transform.rs +++ b/crates/sail-iceberg/src/spec/transform.rs @@ -139,6 +139,38 @@ impl Transform { } } } + + /// Whether the transform preserves the order of values. + pub fn preserves_order(&self) -> bool { + !matches!( + self, + Transform::Void | Transform::Bucket(_) | Transform::Unknown + ) + } + + /// Unique transform name to deduplicate equivalent transforms in a builder. + pub fn dedup_name(&self) -> String { + match self { + Transform::Year | Transform::Month | Transform::Day | Transform::Hour => { + "time".to_string() + } + _ => format!("{self}"), + } + } + + /// Whether ordering by this transform satisfies the ordering of another transform. 
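For intuition, the implementation that follows gives these results (a doctest-style sketch assuming the Transform enum defined earlier in this module):

// Finer-grained time transforms satisfy coarser ones; identity satisfies any
// order-preserving transform.
assert!(Transform::Hour.satisfies_order_of(&Transform::Month));
assert!(Transform::Identity.satisfies_order_of(&Transform::Year));
assert!(!Transform::Month.satisfies_order_of(&Transform::Day));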
+ pub fn satisfies_order_of(&self, other: &Self) -> bool { + match self { + Transform::Identity => other.preserves_order(), + Transform::Hour => matches!( + other, + Transform::Hour | Transform::Day | Transform::Month | Transform::Year + ), + Transform::Day => matches!(other, Transform::Day | Transform::Month | Transform::Year), + Transform::Month => matches!(other, Transform::Month | Transform::Year), + _ => self == other, + } + } } impl Display for Transform { diff --git a/crates/sail-iceberg/src/spec/view_metadata.rs b/crates/sail-iceberg/src/spec/view_metadata.rs new file mode 100644 index 0000000000..cac25bce55 --- /dev/null +++ b/crates/sail-iceberg/src/spec/view_metadata.rs @@ -0,0 +1,3 @@ +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct ViewMetadata; diff --git a/crates/sail-iceberg/src/spec/view_metadata_builder.rs b/crates/sail-iceberg/src/spec/view_metadata_builder.rs new file mode 100644 index 0000000000..62614fdbca --- /dev/null +++ b/crates/sail-iceberg/src/spec/view_metadata_builder.rs @@ -0,0 +1,3 @@ +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct ViewMetadataBuilder; diff --git a/crates/sail-iceberg/src/spec/view_version.rs b/crates/sail-iceberg/src/spec/view_version.rs new file mode 100644 index 0000000000..816828568e --- /dev/null +++ b/crates/sail-iceberg/src/spec/view_version.rs @@ -0,0 +1,3 @@ +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct ViewVersion; From cc9c217a1c29c4f4537ead4af2f4b1678c656085 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Sun, 12 Oct 2025 11:44:45 +0800 Subject: [PATCH 09/32] update --- .../sail-iceberg/src/spec/manifest/_serde.rs | 9 +-- crates/sail-iceberg/src/spec/manifest_list.rs | 55 +++++++++---------- crates/sail-iceberg/src/spec/mod.rs | 2 + .../sail-iceberg/src/spec/name_mapping/mod.rs | 11 ++-- crates/sail-iceberg/src/spec/schema.rs | 11 +++- crates/sail-iceberg/src/spec/schema_utils.rs | 48 ++++++++++++++++ crates/sail-iceberg/src/spec/sort.rs | 2 +- .../sail-iceberg/src/spec/table_metadata.rs | 19 +++++-- 8 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 crates/sail-iceberg/src/spec/schema_utils.rs diff --git a/crates/sail-iceberg/src/spec/manifest/_serde.rs b/crates/sail-iceberg/src/spec/manifest/_serde.rs index 5334aebc09..f4137b05f9 100644 --- a/crates/sail-iceberg/src/spec/manifest/_serde.rs +++ b/crates/sail-iceberg/src/spec/manifest/_serde.rs @@ -49,7 +49,7 @@ pub(super) struct DataFileAvro { #[serde(rename = "split_offsets")] pub split_offsets: Option>, #[serde(rename = "equality_ids")] - pub equality_ids: Option>, + pub equality_ids: Option>, #[serde(rename = "sort_order_id")] pub sort_order_id: Option, } @@ -121,12 +121,7 @@ impl DataFileAvro { block_size_in_bytes: None, key_metadata: self.key_metadata, split_offsets: self.split_offsets.unwrap_or_default(), - equality_ids: self - .equality_ids - .unwrap_or_default() - .into_iter() - .map(|v| v as i32) - .collect(), + equality_ids: self.equality_ids.unwrap_or_default().into_iter().collect(), sort_order_id: self.sort_order_id, first_row_id: None, partition_spec_id, diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs index c4ff07a837..4e629f5a3f 100644 --- a/crates/sail-iceberg/src/spec/manifest_list.rs +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -1,7 +1,6 @@ use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; use serde::{Deserialize, Serialize}; -use super::values::{Literal, PrimitiveLiteral}; use crate::spec::FormatVersion; pub const 
UNASSIGNED_SEQUENCE_NUMBER: i64 = -1; @@ -87,26 +86,12 @@ pub struct FieldSummaryAvro { impl From for FieldSummary { fn from(summary: FieldSummaryAvro) -> Self { - let lower_bound = summary - .lower_bound - .and_then(|bytes| String::from_utf8(bytes).ok()) - .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); - - let upper_bound = summary - .upper_bound - .and_then(|bytes| String::from_utf8(bytes).ok()) - .map(|s| Literal::Primitive(PrimitiveLiteral::String(s))); - let mut field_summary = FieldSummary::new(summary.contains_null); if let Some(contains_nan) = summary.contains_nan { field_summary = field_summary.with_contains_nan(contains_nan); } - if let Some(lower) = lower_bound { - field_summary = field_summary.with_lower_bound(lower); - } - if let Some(upper) = upper_bound { - field_summary = field_summary.with_upper_bound(upper); - } + field_summary.lower_bound_bytes = summary.lower_bound; + field_summary.upper_bound_bytes = summary.upper_bound; field_summary } } @@ -229,6 +214,16 @@ impl ManifestFile { + self.existing_rows_count.unwrap_or(0) + self.deleted_rows_count.unwrap_or(0) } + + /// Whether the manifest contains any added files + pub fn has_added_files(&self) -> bool { + self.added_files_count.unwrap_or(0) > 0 + } + + /// Whether the manifest contains any deleted files + pub fn has_deleted_files(&self) -> bool { + self.deleted_files_count.unwrap_or(0) > 0 + } } /// Field summary for partition fields in a manifest file. @@ -240,12 +235,12 @@ pub struct FieldSummary { /// Whether the partition field contains NaN values (only for float and double). #[serde(skip_serializing_if = "Option::is_none")] pub contains_nan: Option, - /// The minimum value of the partition field. + /// The minimum value of the partition field (binary encoded per spec). #[serde(skip_serializing_if = "Option::is_none")] - pub lower_bound: Option, - /// The maximum value of the partition field. + pub lower_bound_bytes: Option>, + /// The maximum value of the partition field (binary encoded per spec). #[serde(skip_serializing_if = "Option::is_none")] - pub upper_bound: Option, + pub upper_bound_bytes: Option>, } impl FieldSummary { @@ -254,8 +249,8 @@ impl FieldSummary { Self { contains_null, contains_nan: None, - lower_bound: None, - upper_bound: None, + lower_bound_bytes: None, + upper_bound_bytes: None, } } @@ -266,14 +261,14 @@ impl FieldSummary { } /// Set the lower bound of the field. - pub fn with_lower_bound(mut self, lower_bound: Literal) -> Self { - self.lower_bound = Some(lower_bound); + pub fn with_lower_bound_bytes(mut self, lower: Vec) -> Self { + self.lower_bound_bytes = Some(lower); self } /// Set the upper bound of the field. 
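A small sketch (illustrative, assuming the ManifestFile type defined in this module) of how the new has_added_files/has_deleted_files helpers might be used during scan planning:

fn snapshot_has_deletes(manifests: &[ManifestFile]) -> bool {
    // Any manifest reporting deleted files means delete handling is needed.
    manifests.iter().any(|m| m.has_deleted_files())
}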
- pub fn with_upper_bound(mut self, upper_bound: Literal) -> Self { - self.upper_bound = Some(upper_bound); + pub fn with_upper_bound_bytes(mut self, upper: Vec<u8>) -> Self { + self.upper_bound_bytes = Some(upper); self } } @@ -471,11 +466,11 @@ pub(super) mod _serde { pub min_sequence_number: i64, #[serde(rename = "added_snapshot_id")] pub added_snapshot_id: i64, - #[serde(rename = "added_files_count")] + #[serde(rename = "added_files_count", alias = "added_data_files_count")] pub added_files_count: i32, - #[serde(rename = "existing_files_count")] + #[serde(rename = "existing_files_count", alias = "existing_data_files_count")] pub existing_files_count: i32, - #[serde(rename = "deleted_files_count")] + #[serde(rename = "deleted_files_count", alias = "deleted_data_files_count")] pub deleted_files_count: i32, #[serde(rename = "added_rows_count")] pub added_rows_count: i64, diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index c9193d44bd..2910d8e13b 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -37,3 +37,5 @@ pub use values::*; pub use view_metadata::*; pub use view_metadata_builder::*; pub use view_version::*; + +pub mod schema_utils; diff --git a/crates/sail-iceberg/src/spec/name_mapping/mod.rs b/crates/sail-iceberg/src/spec/name_mapping/mod.rs index 1656196d10..41ce091d31 100644 --- a/crates/sail-iceberg/src/spec/name_mapping/mod.rs +++ b/crates/sail-iceberg/src/spec/name_mapping/mod.rs @@ -6,21 +6,18 @@ use serde::{Deserialize, Serialize}; pub const DEFAULT_SCHEMA_NAME_MAPPING: &str = "schema.name-mapping.default"; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] -#[serde(rename_all = "kebab-case")] -pub struct NameMapping { - #[serde(rename = "fields")] - root: Vec<MappedField>, -} +#[serde(transparent)] +pub struct NameMapping(Vec<MappedField>); impl NameMapping { /// Create a new `NameMapping` given mapped fields. pub fn new(fields: Vec<MappedField>) -> Self { - Self { root: fields } + Self(fields) } /// Returns mapped fields pub fn fields(&self) -> &[MappedField] { - &self.root + &self.0 } } diff --git a/crates/sail-iceberg/src/spec/schema.rs b/crates/sail-iceberg/src/spec/schema.rs index f3651d7896..ec90d4dc30 100644 --- a/crates/sail-iceberg/src/spec/schema.rs +++ b/crates/sail-iceberg/src/spec/schema.rs @@ -63,12 +63,21 @@ struct SchemaData { identifier_field_ids: Option<Vec<i32>>, } +#[derive(Deserialize)] +#[serde(untagged)] +enum SchemaEnum { + V1(SchemaData), + V2(SchemaData), +} + impl<'de> Deserialize<'de> for Schema { fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>, { - let data = SchemaData::deserialize(deserializer)?; + let data = match SchemaEnum::deserialize(deserializer)? { + SchemaEnum::V1(d) | SchemaEnum::V2(d) => d, + }; let struct_type = StructType::new(data.fields.clone()); let mut id_to_field = HashMap::new(); diff --git a/crates/sail-iceberg/src/spec/schema_utils.rs b/crates/sail-iceberg/src/spec/schema_utils.rs new file mode 100644 index 0000000000..4cd1cb5bce --- /dev/null +++ b/crates/sail-iceberg/src/spec/schema_utils.rs @@ -0,0 +1,48 @@ +use std::collections::{HashMap, HashSet, VecDeque}; + +use crate::spec::{NestedFieldRef, Schema, Type}; + +/// Visit all fields in a schema in breadth-first order, calling the callback for each field id.
+pub fn visit_fields_bfs<F: FnMut(i32, &NestedFieldRef)>(schema: &Schema, mut f: F) { + let mut queue: VecDeque<NestedFieldRef> = VecDeque::new(); + for field in schema.fields() { + queue.push_back(field.clone()); + } + + while let Some(field) = queue.pop_front() { + f(field.id, &field); + match field.field_type.as_ref() { + Type::Struct(s) => { + for child in s.fields() { + queue.push_back(child.clone()); + } + } + Type::List(l) => queue.push_back(l.element_field.clone()), + Type::Map(m) => { + queue.push_back(m.key_field.clone()); + queue.push_back(m.value_field.clone()); + } + _ => {} + } + } +} + +/// Prune a schema by keeping only fields whose ids are included. +/// Ancestor container fields are preserved automatically. +pub fn prune_schema_by_field_ids(schema: &Schema, keep_ids: &HashSet<i32>) -> Vec<NestedFieldRef> { + let mut kept: HashMap<i32, NestedFieldRef> = HashMap::new(); + + visit_fields_bfs(schema, |id, field| { + if keep_ids.contains(&id) { + kept.insert(id, field.clone()); + } + }); + + // Return only top-level fields that are kept; children are implicitly reachable by id + schema + .fields() + .iter() + .filter(|f| kept.contains_key(&f.id)) + .cloned() + .collect() +} diff --git a/crates/sail-iceberg/src/spec/sort.rs b/crates/sail-iceberg/src/spec/sort.rs index e7f19da384..5c4c90b2db 100644 --- a/crates/sail-iceberg/src/spec/sort.rs +++ b/crates/sail-iceberg/src/spec/sort.rs @@ -71,7 +71,7 @@ impl Display for SortField { pub struct SortOrder { /// Identifier for SortOrder, order_id `0` is no sort order. #[serde(default)] - pub order_id: i32, + pub order_id: i64, /// Details of the sort #[serde(default)] pub fields: Vec<SortField>, diff --git a/crates/sail-iceberg/src/spec/table_metadata.rs b/crates/sail-iceberg/src/spec/table_metadata.rs index ef27068393..f7d901bf5a 100644 --- a/crates/sail-iceberg/src/spec/table_metadata.rs +++ b/crates/sail-iceberg/src/spec/table_metadata.rs @@ -89,6 +89,13 @@ pub struct MetadataLog { pub metadata_file: String, } +#[derive(Deserialize)] +#[serde(untagged)] +enum TableMetadataEnum { + V1(TableMetadata), + V2(TableMetadata), +} + impl TableMetadata { /// Get the current schema pub fn current_schema(&self) -> Option<&Schema> { @@ -144,10 +151,14 @@ impl TableMetadata { } log::debug!("[ICEBERG] Deserializing to TableMetadata struct"); - serde_json::from_value::<TableMetadata>(json_value).map_err(|e| { - log::error!("[ICEBERG] Failed to deserialize TableMetadata: {:?}", e); - e - }) + serde_json::from_value::<TableMetadataEnum>(json_value) + .map_err(|e| { + log::error!("[ICEBERG] Failed to deserialize TableMetadata: {:?}", e); + e + }) + .map(|tm| match tm { + TableMetadataEnum::V1(t) | TableMetadataEnum::V2(t) => t, + }) } Err(e) => { log::error!("[ICEBERG] Failed to parse as JSON: {:?}", e); From a6adbb33ce9650fbcb01cc16d7dc1bfb4bdc4977 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Sun, 12 Oct 2025 12:10:37 +0800 Subject: [PATCH 10/32] catalog spec --- .../src/spec/catalog/metadata_location.rs | 73 ++++ crates/sail-iceberg/src/spec/catalog/mod.rs | 381 ++++++++++++++++++ crates/sail-iceberg/src/spec/mod.rs | 6 +- .../src/spec/partition_unbound.rs | 17 + 4 files changed, 475 insertions(+), 2 deletions(-) create mode 100644 crates/sail-iceberg/src/spec/catalog/metadata_location.rs create mode 100644 crates/sail-iceberg/src/spec/catalog/mod.rs create mode 100644 crates/sail-iceberg/src/spec/partition_unbound.rs diff --git a/crates/sail-iceberg/src/spec/catalog/metadata_location.rs b/crates/sail-iceberg/src/spec/catalog/metadata_location.rs new file mode 100644 index 0000000000..bfb59ca1dd --- /dev/null +++ b/crates/sail-iceberg/src/spec/catalog/metadata_location.rs @@ -0,0
+1,73 @@ +use std::fmt::Display; +use std::str::FromStr; + +use uuid::Uuid; + +#[derive(Clone, Debug, PartialEq)] +pub struct MetadataLocation { + table_location: String, + version: i32, + id: Uuid, +} + +impl MetadataLocation { + pub fn new_with_table_location(table_location: impl ToString) -> Self { + Self { + table_location: table_location.to_string(), + version: 0, + id: Uuid::new_v4(), + } + } + pub fn with_next_version(&self) -> Self { + Self { + table_location: self.table_location.clone(), + version: self.version + 1, + id: Uuid::new_v4(), + } + } + fn parse_metadata_path_prefix(path: &str) -> Result<String, String> { + let prefix = path.strip_suffix("/metadata").ok_or_else(|| { + format!( + "Metadata location not under \"/metadata\" subdirectory: {}", + path + ) + })?; + Ok(prefix.to_string()) + } + fn parse_file_name(file_name: &str) -> Result<(i32, Uuid), String> { + let (version, id) = file_name + .strip_suffix(".metadata.json") + .ok_or_else(|| format!("Invalid metadata file ending: {}", file_name))? + .split_once('-') + .ok_or_else(|| format!("Invalid metadata file name format: {}", file_name))?; + let v = version.parse::<i32>().map_err(|e| e.to_string())?; + let u = Uuid::parse_str(id).map_err(|e| e.to_string())?; + Ok((v, u)) + } +} + +impl Display for MetadataLocation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}/metadata/{:0>5}-{}.metadata.json", + self.table_location, self.version, self.id + ) + } +} + +impl FromStr for MetadataLocation { + type Err = String; + fn from_str(s: &str) -> Result<Self, Self::Err> { + let (path, file_name) = s + .rsplit_once('/') + .ok_or_else(|| format!("Invalid metadata location: {}", s))?; + let prefix = Self::parse_metadata_path_prefix(path)?; + let (version, id) = Self::parse_file_name(file_name)?; + Ok(Self { + table_location: prefix, + version, + id, + }) + } +} diff --git a/crates/sail-iceberg/src/spec/catalog/mod.rs b/crates/sail-iceberg/src/spec/catalog/mod.rs new file mode 100644 index 0000000000..1f52a24ff7 --- /dev/null +++ b/crates/sail-iceberg/src/spec/catalog/mod.rs @@ -0,0 +1,381 @@ +use std::collections::HashMap; +use std::fmt::Display; + +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::spec::partition_unbound::UnboundPartitionSpec; +use crate::spec::{ + FormatVersion, PartitionStatisticsFile, Schema, SchemaId, Snapshot, SnapshotReference, + SortOrder, StatisticsFile, +}; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NamespaceIdent(Vec<String>); + +impl NamespaceIdent { + pub fn new(name: String) -> Self { + Self(vec![name]) + } + pub fn from_vec(names: Vec<String>) -> Self { + Self(names) + } + pub fn inner(self) -> Vec<String> { + self.0 + } +} + +impl AsRef<Vec<String>> for NamespaceIdent { + fn as_ref(&self) -> &Vec<String> { + &self.0 + } +} + +impl Display for NamespaceIdent { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0.join(".")) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Namespace { + name: NamespaceIdent, + properties: HashMap<String, String>, +} + +impl Namespace { + pub fn new(name: NamespaceIdent) -> Self { + Self { + name, + properties: HashMap::new(), + } + } + pub fn with_properties(name: NamespaceIdent, properties: HashMap<String, String>) -> Self { + Self { name, properties } + } + pub fn name(&self) -> &NamespaceIdent { + &self.name + } + pub fn properties(&self) -> &HashMap<String, String> { + &self.properties + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct TableIdent { + pub namespace:
NamespaceIdent, + pub name: String, +} + +impl TableIdent { + pub fn new(namespace: NamespaceIdent, name: String) -> Self { + Self { namespace, name } + } + pub fn namespace(&self) -> &NamespaceIdent { + &self.namespace + } + pub fn name(&self) -> &str { + &self.name + } +} + +impl Display for TableIdent { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}.{}", self.namespace, self.name) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct TableCreation { + pub name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub location: Option, + pub schema: Schema, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub partition_spec: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub sort_order: Option, + #[serde(default)] + pub properties: HashMap, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum TableRequirement { + #[serde(rename = "assert-create")] + NotExist, + #[serde(rename = "assert-table-uuid")] + UuidMatch { uuid: Uuid }, + #[serde(rename = "assert-ref-snapshot-id")] + RefSnapshotIdMatch { + r#ref: String, + #[serde(rename = "snapshot-id")] + snapshot_id: Option, + }, + #[serde(rename = "assert-last-assigned-field-id")] + LastAssignedFieldIdMatch { + #[serde(rename = "last-assigned-field-id")] + last_assigned_field_id: i32, + }, + #[serde(rename = "assert-current-schema-id")] + CurrentSchemaIdMatch { + #[serde(rename = "current-schema-id")] + current_schema_id: SchemaId, + }, + #[serde(rename = "assert-last-assigned-partition-id")] + LastAssignedPartitionIdMatch { + #[serde(rename = "last-assigned-partition-id")] + last_assigned_partition_id: i32, + }, + #[serde(rename = "assert-default-spec-id")] + DefaultSpecIdMatch { + #[serde(rename = "default-spec-id")] + default_spec_id: i32, + }, + #[serde(rename = "assert-default-sort-order-id")] + DefaultSortOrderIdMatch { + #[serde(rename = "default-sort-order-id")] + default_sort_order_id: i64, + }, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(tag = "action", rename_all = "kebab-case")] +pub enum TableUpdate { + UpgradeFormatVersion { + format_version: FormatVersion, + }, + AssignUuid { + uuid: Uuid, + }, + AddSchema { + schema: Box, + }, + SetCurrentSchema { + #[serde(rename = "schema-id")] + schema_id: i32, + }, + AddSpec { + spec: UnboundPartitionSpec, + }, + SetDefaultSpec { + #[serde(rename = "spec-id")] + spec_id: i32, + }, + AddSortOrder { + sort_order: SortOrder, + }, + SetDefaultSortOrder { + #[serde(rename = "sort-order-id")] + sort_order_id: i64, + }, + AddSnapshot { + #[serde(deserialize_with = "crate::spec::catalog::_serde::deserialize_snapshot")] + snapshot: Snapshot, + }, + SetSnapshotRef { + #[serde(rename = "ref-name")] + ref_name: String, + #[serde(flatten)] + reference: SnapshotReference, + }, + RemoveSnapshots { + #[serde(rename = "snapshot-ids")] + snapshot_ids: Vec, + }, + RemoveSnapshotRef { + #[serde(rename = "ref-name")] + ref_name: String, + }, + SetLocation { + location: String, + }, + SetProperties { + updates: HashMap, + }, + RemoveProperties { + removals: Vec, + }, + RemovePartitionSpecs { + #[serde(rename = "spec-ids")] + spec_ids: Vec, + }, + #[serde(with = "_serde_set_statistics")] + SetStatistics { + statistics: StatisticsFile, + }, + RemoveStatistics { + #[serde(rename = "snapshot-id")] + snapshot_id: i64, + }, + SetPartitionStatistics { + partition_statistics: 
PartitionStatisticsFile, + }, + RemovePartitionStatistics { + #[serde(rename = "snapshot-id")] + snapshot_id: i64, + }, + RemoveSchemas { + #[serde(rename = "schema-ids")] + schema_ids: Vec, + }, +} + +pub(super) mod _serde { + use serde::{Deserialize as _, Deserializer}; + + use super::*; + pub(super) fn deserialize_snapshot<'de, D>( + deserializer: D, + ) -> std::result::Result + where + D: Deserializer<'de>, + { + let buf = CatalogSnapshot::deserialize(deserializer)?; + Snapshot::try_from(buf).map_err(serde::de::Error::custom) + } + #[derive(Debug, Deserialize, PartialEq, Eq)] + #[serde(rename_all = "kebab-case")] + struct CatalogSnapshot { + snapshot_id: i64, + #[serde(skip_serializing_if = "Option::is_none")] + parent_snapshot_id: Option, + #[serde(default)] + sequence_number: i64, + timestamp_ms: i64, + manifest_list: String, + summary: super::super::snapshot::Summary, + #[serde(skip_serializing_if = "Option::is_none")] + schema_id: Option, + } + impl TryFrom for Snapshot { + type Error = String; + fn try_from(snapshot: CatalogSnapshot) -> Result { + let mut builder = Snapshot::builder() + .with_snapshot_id(snapshot.snapshot_id) + .with_sequence_number(snapshot.sequence_number) + .with_timestamp_ms(snapshot.timestamp_ms) + .with_manifest_list(snapshot.manifest_list) + .with_summary(snapshot.summary); + if let Some(parent) = snapshot.parent_snapshot_id { + builder = builder.with_parent_snapshot_id(parent); + } + if let Some(schema_id) = snapshot.schema_id { + builder = builder.with_schema_id(schema_id); + } + builder.build() + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ViewFormatVersion { + #[serde(rename = "1")] + V1 = 1, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "kebab-case")] +pub enum ViewRepresentation { + #[serde(rename_all = "kebab-case")] + Sql { + sql: String, + #[serde(default)] + dialect: String, + }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ViewRepresentations(pub Vec); + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct ViewVersion { + #[serde(rename = "version-id")] + pub version_id: i32, + #[serde(rename = "timestamp-ms")] + pub timestamp_ms: i64, + #[serde(rename = "schema-id")] + pub schema_id: SchemaId, + #[serde(default, rename = "default-catalog")] + pub default_catalog: Option, + #[serde(rename = "default-namespace")] + pub default_namespace: NamespaceIdent, + #[serde(default)] + pub summary: HashMap, + pub representations: ViewRepresentations, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(tag = "action", rename_all = "kebab-case")] +pub enum ViewUpdate { + AssignUuid { + uuid: Uuid, + }, + UpgradeFormatVersion { + format_version: ViewFormatVersion, + }, + AddSchema { + schema: Box, + #[serde(rename = "last-column-id", skip_serializing_if = "Option::is_none")] + last_column_id: Option, + }, + SetLocation { + location: String, + }, + SetProperties { + updates: HashMap, + }, + RemoveProperties { + removals: Vec, + }, + AddViewVersion { + #[serde(rename = "view-version")] + view_version: ViewVersion, + }, + SetCurrentViewVersion { + #[serde(rename = "view-version-id")] + view_version_id: i32, + }, +} + +mod _serde_set_statistics { + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + + use super::*; + #[derive(Debug, Serialize, Deserialize)] + #[serde(rename_all = "kebab-case")] + struct SetStatistics { + snapshot_id: Option, + 
statistics: StatisticsFile, + } + pub fn serialize( + value: &StatisticsFile, + serializer: S, + ) -> std::result::Result + where + S: Serializer, + { + SetStatistics { + snapshot_id: Some(value.snapshot_id), + statistics: value.clone(), + } + .serialize(serializer) + } + pub fn deserialize<'de, D>(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + let SetStatistics { + snapshot_id, + statistics, + } = SetStatistics::deserialize(deserializer)?; + if let Some(snapshot_id) = snapshot_id { + if snapshot_id != statistics.snapshot_id { + return Err(serde::de::Error::custom(format!("Snapshot id to set {snapshot_id} does not match the statistics file snapshot id {}", statistics.snapshot_id))); + } + } + Ok(statistics) + } +} diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index 2910d8e13b..6643431eba 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -1,3 +1,4 @@ +pub mod catalog; pub mod datatypes; pub mod encrypted_key; pub mod format; @@ -5,6 +6,7 @@ pub mod manifest; pub mod manifest_list; pub mod name_mapping; pub mod partition; +pub mod partition_unbound; pub mod schema; pub mod snapshot; pub mod snapshot_summary; @@ -16,8 +18,8 @@ pub mod transform; pub mod values; pub mod view_metadata; pub mod view_metadata_builder; -pub mod view_version; +pub use catalog::*; pub use datatypes::*; pub use encrypted_key::*; pub use format::*; @@ -25,6 +27,7 @@ pub use manifest::*; pub use manifest_list::*; pub use name_mapping::*; pub use partition::*; +pub use partition_unbound::*; pub use schema::*; pub use snapshot::*; pub use snapshot_summary::*; @@ -36,6 +39,5 @@ pub use transform::*; pub use values::*; pub use view_metadata::*; pub use view_metadata_builder::*; -pub use view_version::*; pub mod schema_utils; diff --git a/crates/sail-iceberg/src/spec/partition_unbound.rs b/crates/sail-iceberg/src/spec/partition_unbound.rs new file mode 100644 index 0000000000..bc4beacfa7 --- /dev/null +++ b/crates/sail-iceberg/src/spec/partition_unbound.rs @@ -0,0 +1,17 @@ +use serde::{Deserialize, Serialize}; + +use crate::spec::transform::Transform; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct UnboundPartitionField { + pub source_id: i32, + pub name: String, + pub transform: Transform, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub struct UnboundPartitionSpec { + pub fields: Vec, +} From 65eedc3fefaae20508b8a5fd0c4d236b99395bfb Mon Sep 17 00:00:00 2001 From: xiaolong Date: Sun, 12 Oct 2025 13:03:27 +0800 Subject: [PATCH 11/32] comments --- .../src/spec/catalog/metadata_location.rs | 19 ++++++++ crates/sail-iceberg/src/spec/catalog/mod.rs | 21 +++++++++ crates/sail-iceberg/src/spec/datatypes.rs | 19 ++++++++ crates/sail-iceberg/src/spec/encrypted_key.rs | 19 ++++++++ crates/sail-iceberg/src/spec/format.rs | 19 ++++++++ .../sail-iceberg/src/spec/manifest/_serde.rs | 19 ++++++++ .../src/spec/manifest/data_file.rs | 19 ++++++++ .../sail-iceberg/src/spec/manifest/entry.rs | 19 ++++++++ .../src/spec/manifest/metadata.rs | 19 ++++++++ crates/sail-iceberg/src/spec/manifest/mod.rs | 19 ++++++++ .../sail-iceberg/src/spec/manifest/writer.rs | 19 ++++++++ crates/sail-iceberg/src/spec/manifest_list.rs | 19 ++++++++ crates/sail-iceberg/src/spec/mod.rs | 19 ++++++++ .../sail-iceberg/src/spec/name_mapping/mod.rs | 19 ++++++++ crates/sail-iceberg/src/spec/partition.rs | 19 ++++++++ 
.../src/spec/partition_unbound.rs | 19 ++++++++ crates/sail-iceberg/src/spec/schema.rs | 19 ++++++++ crates/sail-iceberg/src/spec/schema_utils.rs | 19 ++++++++ crates/sail-iceberg/src/spec/snapshot.rs | 19 ++++++++ .../sail-iceberg/src/spec/snapshot_summary.rs | 19 ++++++++ crates/sail-iceberg/src/spec/sort.rs | 19 ++++++++ .../sail-iceberg/src/spec/statistic_file.rs | 19 ++++++++ crates/sail-iceberg/src/spec/statistics.rs | 46 ------------------- .../sail-iceberg/src/spec/table_metadata.rs | 19 ++++++++ .../src/spec/table_metadata_builder.rs | 19 ++++++++ crates/sail-iceberg/src/spec/transform.rs | 19 ++++++++ crates/sail-iceberg/src/spec/values.rs | 19 ++++++++ crates/sail-iceberg/src/spec/view_metadata.rs | 19 ++++++++ .../src/spec/view_metadata_builder.rs | 19 ++++++++ crates/sail-iceberg/src/spec/view_version.rs | 19 ++++++++ 30 files changed, 553 insertions(+), 46 deletions(-) delete mode 100644 crates/sail-iceberg/src/spec/statistics.rs diff --git a/crates/sail-iceberg/src/spec/catalog/metadata_location.rs b/crates/sail-iceberg/src/spec/catalog/metadata_location.rs index bfb59ca1dd..cd6c69dc58 100644 --- a/crates/sail-iceberg/src/spec/catalog/metadata_location.rs +++ b/crates/sail-iceberg/src/spec/catalog/metadata_location.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/catalog/metadata_location.rs + use std::fmt::Display; use std::str::FromStr; diff --git a/crates/sail-iceberg/src/spec/catalog/mod.rs b/crates/sail-iceberg/src/spec/catalog/mod.rs index 1f52a24ff7..e7d31ed792 100644 --- a/crates/sail-iceberg/src/spec/catalog/mod.rs +++ b/crates/sail-iceberg/src/spec/catalog/mod.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/catalog/mod.rs + use std::collections::HashMap; use std::fmt::Display; @@ -10,6 +29,8 @@ use crate::spec::{ SortOrder, StatisticsFile, }; +pub mod metadata_location; + #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct NamespaceIdent(Vec); diff --git a/crates/sail-iceberg/src/spec/datatypes.rs b/crates/sail-iceberg/src/spec/datatypes.rs index d97c21f1bb..88457500f8 100644 --- a/crates/sail-iceberg/src/spec/datatypes.rs +++ b/crates/sail-iceberg/src/spec/datatypes.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/datatypes.rs + use std::collections::HashMap; use std::fmt; use std::ops::Index; diff --git a/crates/sail-iceberg/src/spec/encrypted_key.rs b/crates/sail-iceberg/src/spec/encrypted_key.rs index 498e72b11a..3e8d25721d 100644 --- a/crates/sail-iceberg/src/spec/encrypted_key.rs +++ b/crates/sail-iceberg/src/spec/encrypted_key.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/encrypted_key.rs + // Awareness stub for non-read path #[allow(dead_code)] #[derive(Debug, Clone)] diff --git a/crates/sail-iceberg/src/spec/format.rs b/crates/sail-iceberg/src/spec/format.rs index 955ddaebed..3ed4848284 100644 --- a/crates/sail-iceberg/src/spec/format.rs +++ b/crates/sail-iceberg/src/spec/format.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/table_metadata.rs + /// Format version of Iceberg. #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum FormatVersion { diff --git a/crates/sail-iceberg/src/spec/manifest/_serde.rs b/crates/sail-iceberg/src/spec/manifest/_serde.rs index f4137b05f9..deab8caa74 100644 --- a/crates/sail-iceberg/src/spec/manifest/_serde.rs +++ b/crates/sail-iceberg/src/spec/manifest/_serde.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/_serde.rs + use apache_avro::types::Value as AvroValue; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/manifest/data_file.rs b/crates/sail-iceberg/src/spec/manifest/data_file.rs index 0de5adb645..83333e597f 100644 --- a/crates/sail-iceberg/src/spec/manifest/data_file.rs +++ b/crates/sail-iceberg/src/spec/manifest/data_file.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/data_file.rs + use std::collections::HashMap; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/manifest/entry.rs b/crates/sail-iceberg/src/spec/manifest/entry.rs index 47b6c484ae..449b7111d4 100644 --- a/crates/sail-iceberg/src/spec/manifest/entry.rs +++ b/crates/sail-iceberg/src/spec/manifest/entry.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/entry.rs + use serde::{Deserialize, Serialize}; use super::DataFile; diff --git a/crates/sail-iceberg/src/spec/manifest/metadata.rs b/crates/sail-iceberg/src/spec/manifest/metadata.rs index 2333355fb2..3f24f526d5 100644 --- a/crates/sail-iceberg/src/spec/manifest/metadata.rs +++ b/crates/sail-iceberg/src/spec/manifest/metadata.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/metadata.rs + use serde::{Deserialize, Serialize}; use crate::spec::{ diff --git a/crates/sail-iceberg/src/spec/manifest/mod.rs b/crates/sail-iceberg/src/spec/manifest/mod.rs index ef61ba5c5d..ed328b66b0 100644 --- a/crates/sail-iceberg/src/spec/manifest/mod.rs +++ b/crates/sail-iceberg/src/spec/manifest/mod.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/mod.rs + use std::sync::Arc; use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; diff --git a/crates/sail-iceberg/src/spec/manifest/writer.rs b/crates/sail-iceberg/src/spec/manifest/writer.rs index 22ebe298b0..cabca8869b 100644 --- a/crates/sail-iceberg/src/spec/manifest/writer.rs +++ b/crates/sail-iceberg/src/spec/manifest/writer.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/writer.rs + // Awareness stub for non-read path // TODO: Implement writer support if/when write path is added. #[allow(dead_code)] diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs index 4e629f5a3f..699fe78819 100644 --- a/crates/sail-iceberg/src/spec/manifest_list.rs +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest_list.rs + use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index 6643431eba..11102ff3d2 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/mod.rs + pub mod catalog; pub mod datatypes; pub mod encrypted_key; diff --git a/crates/sail-iceberg/src/spec/name_mapping/mod.rs b/crates/sail-iceberg/src/spec/name_mapping/mod.rs index 41ce091d31..b9bd9258a3 100644 --- a/crates/sail-iceberg/src/spec/name_mapping/mod.rs +++ b/crates/sail-iceberg/src/spec/name_mapping/mod.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/name_mapping/mod.rs + use std::sync::Arc; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/partition.rs b/crates/sail-iceberg/src/spec/partition.rs index 07ffcf022d..3af426e70d 100644 --- a/crates/sail-iceberg/src/spec/partition.rs +++ b/crates/sail-iceberg/src/spec/partition.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/partition.rs + use std::sync::Arc; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/partition_unbound.rs b/crates/sail-iceberg/src/spec/partition_unbound.rs index bc4beacfa7..884da089c2 100644 --- a/crates/sail-iceberg/src/spec/partition_unbound.rs +++ b/crates/sail-iceberg/src/spec/partition_unbound.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/partition.rs + use serde::{Deserialize, Serialize}; use crate::spec::transform::Transform; diff --git a/crates/sail-iceberg/src/spec/schema.rs b/crates/sail-iceberg/src/spec/schema.rs index ec90d4dc30..ab7211b47a 100644 --- a/crates/sail-iceberg/src/spec/schema.rs +++ b/crates/sail-iceberg/src/spec/schema.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/schema/mod.rs + use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; use std::sync::Arc; diff --git a/crates/sail-iceberg/src/spec/schema_utils.rs b/crates/sail-iceberg/src/spec/schema_utils.rs index 4cd1cb5bce..79523136e4 100644 --- a/crates/sail-iceberg/src/spec/schema_utils.rs +++ b/crates/sail-iceberg/src/spec/schema_utils.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/schema/utils.rs + use std::collections::{HashMap, HashSet, VecDeque}; use crate::spec::{NestedFieldRef, Schema, Type}; diff --git a/crates/sail-iceberg/src/spec/snapshot.rs b/crates/sail-iceberg/src/spec/snapshot.rs index 84a44749b7..95e9883118 100644 --- a/crates/sail-iceberg/src/spec/snapshot.rs +++ b/crates/sail-iceberg/src/spec/snapshot.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/snapshot.rs + use std::collections::HashMap; use std::sync::Arc; diff --git a/crates/sail-iceberg/src/spec/snapshot_summary.rs b/crates/sail-iceberg/src/spec/snapshot_summary.rs index ac8fc8de4e..57e877038b 100644 --- a/crates/sail-iceberg/src/spec/snapshot_summary.rs +++ b/crates/sail-iceberg/src/spec/snapshot_summary.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/snapshot_summary.rs + // Awareness stub for read-path. Extend later if needed. #[allow(dead_code)] #[derive(Debug, Clone)] diff --git a/crates/sail-iceberg/src/spec/sort.rs b/crates/sail-iceberg/src/spec/sort.rs index 5c4c90b2db..d507d8ae7f 100644 --- a/crates/sail-iceberg/src/spec/sort.rs +++ b/crates/sail-iceberg/src/spec/sort.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/sort.rs + use std::fmt::{Display, Formatter}; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/statistic_file.rs b/crates/sail-iceberg/src/spec/statistic_file.rs index d184e76079..84f4eb7d84 100644 --- a/crates/sail-iceberg/src/spec/statistic_file.rs +++ b/crates/sail-iceberg/src/spec/statistic_file.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/statistic_file.rs + use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/crates/sail-iceberg/src/spec/statistics.rs b/crates/sail-iceberg/src/spec/statistics.rs deleted file mode 100644 index d184e76079..0000000000 --- a/crates/sail-iceberg/src/spec/statistics.rs +++ /dev/null @@ -1,46 +0,0 @@ -use serde::{Deserialize, Serialize}; - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "kebab-case")] -pub struct StatisticsFile { - /// The snapshot id of the statistics file. - pub snapshot_id: i64, - /// Path of the statistics file - pub statistics_path: String, - /// File size in bytes - pub file_size_in_bytes: i64, - /// File footer size in bytes - pub file_footer_size_in_bytes: i64, - /// Base64-encoded implementation-specific key metadata for encryption. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub key_metadata: Option, - /// Blob metadata - pub blob_metadata: Vec, -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "kebab-case")] -pub struct BlobMetadata { - /// Type of the blob. - pub r#type: String, - /// Snapshot id of the blob. - pub snapshot_id: i64, - /// Sequence number of the blob. - pub sequence_number: i64, - /// Fields of the blob. - pub fields: Vec, - /// Properties of the blob. - #[serde(default, skip_serializing_if = "std::collections::HashMap::is_empty")] - pub properties: std::collections::HashMap, -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "kebab-case")] -pub struct PartitionStatisticsFile { - /// The snapshot id of the statistics file. - pub snapshot_id: i64, - /// Path of the statistics file - pub statistics_path: String, - /// File size in bytes - pub file_size_in_bytes: i64, -} diff --git a/crates/sail-iceberg/src/spec/table_metadata.rs b/crates/sail-iceberg/src/spec/table_metadata.rs index f7d901bf5a..ec91472489 100644 --- a/crates/sail-iceberg/src/spec/table_metadata.rs +++ b/crates/sail-iceberg/src/spec/table_metadata.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/table_metadata.rs + use std::collections::HashMap; use serde::{Deserialize, Serialize}; diff --git a/crates/sail-iceberg/src/spec/table_metadata_builder.rs b/crates/sail-iceberg/src/spec/table_metadata_builder.rs index 1f70a8693a..8c9e880449 100644 --- a/crates/sail-iceberg/src/spec/table_metadata_builder.rs +++ b/crates/sail-iceberg/src/spec/table_metadata_builder.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/table_metadata_builder.rs + #[allow(dead_code)] #[derive(Debug, Clone)] pub struct TableMetadataBuilder; diff --git a/crates/sail-iceberg/src/spec/transform.rs b/crates/sail-iceberg/src/spec/transform.rs index 50571aa4c4..a3ed29ecb2 100644 --- a/crates/sail-iceberg/src/spec/transform.rs +++ b/crates/sail-iceberg/src/spec/transform.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/transform.rs + use std::fmt::{Display, Formatter}; use std::str::FromStr; diff --git a/crates/sail-iceberg/src/spec/values.rs b/crates/sail-iceberg/src/spec/values.rs index e8acd48020..9e595db4ba 100644 --- a/crates/sail-iceberg/src/spec/values.rs +++ b/crates/sail-iceberg/src/spec/values.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/values.rs + use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; diff --git a/crates/sail-iceberg/src/spec/view_metadata.rs b/crates/sail-iceberg/src/spec/view_metadata.rs index cac25bce55..989a59ff58 100644 --- a/crates/sail-iceberg/src/spec/view_metadata.rs +++ b/crates/sail-iceberg/src/spec/view_metadata.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/view_metadata.rs + #[allow(dead_code)] #[derive(Debug, Clone)] pub struct ViewMetadata; diff --git a/crates/sail-iceberg/src/spec/view_metadata_builder.rs b/crates/sail-iceberg/src/spec/view_metadata_builder.rs index 62614fdbca..e8554c2a55 100644 --- a/crates/sail-iceberg/src/spec/view_metadata_builder.rs +++ b/crates/sail-iceberg/src/spec/view_metadata_builder.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/view_metadata_builder.rs + #[allow(dead_code)] #[derive(Debug, Clone)] pub struct ViewMetadataBuilder; diff --git a/crates/sail-iceberg/src/spec/view_version.rs b/crates/sail-iceberg/src/spec/view_version.rs index 816828568e..6610eecbcb 100644 --- a/crates/sail-iceberg/src/spec/view_version.rs +++ b/crates/sail-iceberg/src/spec/view_version.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/view_version.rs + #[allow(dead_code)] #[derive(Debug, Clone)] pub struct ViewVersion; From b7b1283c62ebe02e26762d4e70d50a67882541ed Mon Sep 17 00:00:00 2001 From: xiaolong Date: Mon, 13 Oct 2025 14:19:56 +0800 Subject: [PATCH 12/32] update --- .../sail-iceberg/src/datasource/provider.rs | 5 +- crates/sail-iceberg/src/spec/catalog/mod.rs | 33 +---------- .../src/spec/manifest/data_file.rs | 3 +- crates/sail-iceberg/src/spec/manifest/mod.rs | 6 +- .../src/spec/{ => metadata}/format.rs | 0 .../spec/{view_version.rs => metadata/mod.rs} | 12 ++-- .../src/spec/{ => metadata}/statistic_file.rs | 0 .../src/spec/{ => metadata}/table_metadata.rs | 10 ++-- .../{ => metadata}/table_metadata_builder.rs | 0 crates/sail-iceberg/src/spec/mod.rs | 28 +++------- crates/sail-iceberg/src/spec/partition.rs | 2 +- crates/sail-iceberg/src/spec/schema.rs | 2 +- crates/sail-iceberg/src/spec/snapshots/mod.rs | 22 ++++++++ .../src/spec/{ => snapshots}/snapshot.rs | 56 +++++++++---------- .../summary.rs} | 2 - crates/sail-iceberg/src/spec/transform.rs | 4 +- .../src/spec/{datatypes.rs => types/mod.rs} | 16 +++--- .../src/spec/{ => types}/values.rs | 11 ++-- crates/sail-iceberg/src/spec/views/mod.rs | 24 ++++++++ .../src/spec/{ => views}/view_metadata.rs | 3 +- .../spec/{ => views}/view_metadata_builder.rs | 3 +- .../src/spec/views/view_version.rs | 55 ++++++++++++++++++ 22 files changed, 180 insertions(+), 117 deletions(-) rename crates/sail-iceberg/src/spec/{ => metadata}/format.rs (100%) rename crates/sail-iceberg/src/spec/{view_version.rs => metadata/mod.rs} (78%) rename crates/sail-iceberg/src/spec/{ => metadata}/statistic_file.rs (100%) rename crates/sail-iceberg/src/spec/{ => metadata}/table_metadata.rs (95%) rename crates/sail-iceberg/src/spec/{ => metadata}/table_metadata_builder.rs (100%) create mode 100644 crates/sail-iceberg/src/spec/snapshots/mod.rs rename crates/sail-iceberg/src/spec/{ => snapshots}/snapshot.rs (99%) rename crates/sail-iceberg/src/spec/{snapshot_summary.rs => snapshots/summary.rs} (85%) rename crates/sail-iceberg/src/spec/{datatypes.rs => types/mod.rs} (98%) rename 
crates/sail-iceberg/src/spec/{ => types}/values.rs (90%) create mode 100644 crates/sail-iceberg/src/spec/views/mod.rs rename crates/sail-iceberg/src/spec/{ => views}/view_metadata.rs (85%) rename crates/sail-iceberg/src/spec/{ => views}/view_metadata_builder.rs (84%) create mode 100644 crates/sail-iceberg/src/spec/views/view_version.rs diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index aedbdb492f..86c135b944 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -22,9 +22,10 @@ use object_store::ObjectMeta; use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; +use crate::spec::types::values::{Literal, PrimitiveLiteral}; use crate::spec::{ - DataFile, FormatVersion, Literal, Manifest, ManifestContentType, ManifestList, ManifestStatus, - PrimitiveLiteral, Schema, Snapshot, + DataFile, FormatVersion, Manifest, ManifestContentType, ManifestList, ManifestStatus, Schema, + Snapshot, }; /// Iceberg table provider for DataFusion diff --git a/crates/sail-iceberg/src/spec/catalog/mod.rs b/crates/sail-iceberg/src/spec/catalog/mod.rs index e7d31ed792..d643d670be 100644 --- a/crates/sail-iceberg/src/spec/catalog/mod.rs +++ b/crates/sail-iceberg/src/spec/catalog/mod.rs @@ -24,6 +24,7 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; use crate::spec::partition_unbound::UnboundPartitionSpec; +use crate::spec::views::ViewVersion; use crate::spec::{ FormatVersion, PartitionStatisticsFile, Schema, SchemaId, Snapshot, SnapshotReference, SortOrder, StatisticsFile, @@ -296,38 +297,6 @@ pub enum ViewFormatVersion { V1 = 1, } -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "kebab-case")] -pub enum ViewRepresentation { - #[serde(rename_all = "kebab-case")] - Sql { - sql: String, - #[serde(default)] - dialect: String, - }, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct ViewRepresentations(pub Vec); - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "kebab-case")] -pub struct ViewVersion { - #[serde(rename = "version-id")] - pub version_id: i32, - #[serde(rename = "timestamp-ms")] - pub timestamp_ms: i64, - #[serde(rename = "schema-id")] - pub schema_id: SchemaId, - #[serde(default, rename = "default-catalog")] - pub default_catalog: Option, - #[serde(rename = "default-namespace")] - pub default_namespace: NamespaceIdent, - #[serde(default)] - pub summary: HashMap, - pub representations: ViewRepresentations, -} - #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(tag = "action", rename_all = "kebab-case")] pub enum ViewUpdate { diff --git a/crates/sail-iceberg/src/spec/manifest/data_file.rs b/crates/sail-iceberg/src/spec/manifest/data_file.rs index 83333e597f..d660dd1d53 100644 --- a/crates/sail-iceberg/src/spec/manifest/data_file.rs +++ b/crates/sail-iceberg/src/spec/manifest/data_file.rs @@ -21,7 +21,8 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::spec::{Datum, Literal}; +use crate::spec::types::values::Literal; +use crate::spec::Datum; /// Content type of a data file. 
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] diff --git a/crates/sail-iceberg/src/spec/manifest/mod.rs b/crates/sail-iceberg/src/spec/manifest/mod.rs index ed328b66b0..b9ec7faf03 100644 --- a/crates/sail-iceberg/src/spec/manifest/mod.rs +++ b/crates/sail-iceberg/src/spec/manifest/mod.rs @@ -123,9 +123,9 @@ impl Manifest { } // Helper functions used by Avro serde to parse partition values and bounds -use crate::spec::datatypes::Type; -use crate::spec::values::Literal; -use crate::spec::{Datum, PrimitiveLiteral, PrimitiveType}; +use crate::spec::types::values::{Literal, PrimitiveLiteral}; +use crate::spec::types::Type; +use crate::spec::{Datum, PrimitiveType}; pub(super) fn parse_partition_values(json: Option<&serde_json::Value>) -> Vec> { match json { diff --git a/crates/sail-iceberg/src/spec/format.rs b/crates/sail-iceberg/src/spec/metadata/format.rs similarity index 100% rename from crates/sail-iceberg/src/spec/format.rs rename to crates/sail-iceberg/src/spec/metadata/format.rs diff --git a/crates/sail-iceberg/src/spec/view_version.rs b/crates/sail-iceberg/src/spec/metadata/mod.rs similarity index 78% rename from crates/sail-iceberg/src/spec/view_version.rs rename to crates/sail-iceberg/src/spec/metadata/mod.rs index 6610eecbcb..5f76b8a2e8 100644 --- a/crates/sail-iceberg/src/spec/view_version.rs +++ b/crates/sail-iceberg/src/spec/metadata/mod.rs @@ -15,8 +15,12 @@ // specific language governing permissions and limitations // under the License. -// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/view_version.rs +pub mod format; +pub mod statistic_file; +pub mod table_metadata; +pub mod table_metadata_builder; -#[allow(dead_code)] -#[derive(Debug, Clone)] -pub struct ViewVersion; +pub use format::*; +pub use statistic_file::*; +pub use table_metadata::*; +pub use table_metadata_builder::*; diff --git a/crates/sail-iceberg/src/spec/statistic_file.rs b/crates/sail-iceberg/src/spec/metadata/statistic_file.rs similarity index 100% rename from crates/sail-iceberg/src/spec/statistic_file.rs rename to crates/sail-iceberg/src/spec/metadata/statistic_file.rs diff --git a/crates/sail-iceberg/src/spec/table_metadata.rs b/crates/sail-iceberg/src/spec/metadata/table_metadata.rs similarity index 95% rename from crates/sail-iceberg/src/spec/table_metadata.rs rename to crates/sail-iceberg/src/spec/metadata/table_metadata.rs index ec91472489..cb79b3bbaf 100644 --- a/crates/sail-iceberg/src/spec/table_metadata.rs +++ b/crates/sail-iceberg/src/spec/metadata/table_metadata.rs @@ -22,10 +22,12 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::{ - FormatVersion, PartitionSpec, PartitionStatisticsFile, Schema, Snapshot, SnapshotReference, - SortOrder, StatisticsFile, -}; +use crate::spec::metadata::format::FormatVersion; +use crate::spec::metadata::statistic_file::{PartitionStatisticsFile, StatisticsFile}; +use crate::spec::partition::PartitionSpec; +use crate::spec::schema::Schema; +use crate::spec::snapshots::{Snapshot, SnapshotReference}; +use crate::spec::sort::SortOrder; /// Iceberg table metadata #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/crates/sail-iceberg/src/spec/table_metadata_builder.rs b/crates/sail-iceberg/src/spec/metadata/table_metadata_builder.rs similarity index 100% rename from crates/sail-iceberg/src/spec/table_metadata_builder.rs rename to crates/sail-iceberg/src/spec/metadata/table_metadata_builder.rs diff --git 
a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index 11102ff3d2..941dee9794 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -18,45 +18,33 @@ // [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/mod.rs pub mod catalog; -pub mod datatypes; pub mod encrypted_key; -pub mod format; pub mod manifest; pub mod manifest_list; +pub mod metadata; pub mod name_mapping; pub mod partition; pub mod partition_unbound; pub mod schema; -pub mod snapshot; -pub mod snapshot_summary; +pub mod snapshots; pub mod sort; -pub mod statistic_file; -pub mod table_metadata; -pub mod table_metadata_builder; pub mod transform; -pub mod values; -pub mod view_metadata; -pub mod view_metadata_builder; +pub mod types; +pub mod views; pub use catalog::*; -pub use datatypes::*; pub use encrypted_key::*; -pub use format::*; pub use manifest::*; pub use manifest_list::*; +pub use metadata::*; pub use name_mapping::*; pub use partition::*; pub use partition_unbound::*; pub use schema::*; -pub use snapshot::*; -pub use snapshot_summary::*; +pub use snapshots::*; pub use sort::*; -pub use statistic_file::*; -pub use table_metadata::*; -pub use table_metadata_builder::*; pub use transform::*; -pub use values::*; -pub use view_metadata::*; -pub use view_metadata_builder::*; +pub use types::*; +pub use views::*; pub mod schema_utils; diff --git a/crates/sail-iceberg/src/spec/partition.rs b/crates/sail-iceberg/src/spec/partition.rs index 3af426e70d..4c36721c92 100644 --- a/crates/sail-iceberg/src/spec/partition.rs +++ b/crates/sail-iceberg/src/spec/partition.rs @@ -21,9 +21,9 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; -use super::datatypes::{NestedField, StructType}; use super::schema::Schema; use super::transform::Transform; +use super::types::{NestedField, StructType}; #[allow(unused)] pub(crate) const UNPARTITIONED_LAST_ASSIGNED_ID: i32 = 999; diff --git a/crates/sail-iceberg/src/spec/schema.rs b/crates/sail-iceberg/src/spec/schema.rs index ab7211b47a..aa6422ce5d 100644 --- a/crates/sail-iceberg/src/spec/schema.rs +++ b/crates/sail-iceberg/src/spec/schema.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; -use super::datatypes::{NestedFieldRef, PrimitiveType, StructType, Type}; +use super::types::{NestedFieldRef, PrimitiveType, StructType, Type}; /// Type alias for schema id. pub type SchemaId = i32; diff --git a/crates/sail-iceberg/src/spec/snapshots/mod.rs b/crates/sail-iceberg/src/spec/snapshots/mod.rs new file mode 100644 index 0000000000..ba291d4127 --- /dev/null +++ b/crates/sail-iceberg/src/spec/snapshots/mod.rs @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +pub mod snapshot; +pub mod summary; + +pub use snapshot::*; +pub use summary::*; diff --git a/crates/sail-iceberg/src/spec/snapshot.rs b/crates/sail-iceberg/src/spec/snapshots/snapshot.rs similarity index 99% rename from crates/sail-iceberg/src/spec/snapshot.rs rename to crates/sail-iceberg/src/spec/snapshots/snapshot.rs index 95e9883118..c9d764abf2 100644 --- a/crates/sail-iceberg/src/spec/snapshot.rs +++ b/crates/sail-iceberg/src/spec/snapshots/snapshot.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use super::schema::SchemaId; +use crate::spec::schema::SchemaId; /// The ref name of the main branch of the table. pub const MAIN_BRANCH: &str = "main"; @@ -66,33 +66,6 @@ impl Default for Operation { } #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] -/// Summarises the changes in the snapshot. -pub struct Summary { - /// The type of operation in the snapshot - pub operation: Operation, - /// Other summary data. - #[serde(flatten)] - pub additional_properties: HashMap, -} - -impl Summary { - /// Create a new summary with the given operation. - pub fn new(operation: Operation) -> Self { - Self { - operation, - additional_properties: HashMap::new(), - } - } - - /// Add additional property to the summary. - pub fn with_property(mut self, key: impl ToString, value: impl ToString) -> Self { - self.additional_properties - .insert(key.to_string(), value.to_string()); - self - } -} - -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] /// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table. pub struct Snapshot { @@ -164,6 +137,33 @@ pub enum SnapshotRetention { }, } +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +/// Summarises the changes in the snapshot. +pub struct Summary { + /// The type of operation in the snapshot + pub operation: Operation, + /// Other summary data. + #[serde(flatten)] + pub additional_properties: HashMap, +} + +impl Summary { + /// Create a new summary with the given operation. + pub fn new(operation: Operation) -> Self { + Self { + operation, + additional_properties: HashMap::new(), + } + } + + /// Add additional property to the summary. + pub fn with_property(mut self, key: impl ToString, value: impl ToString) -> Self { + self.additional_properties + .insert(key.to_string(), value.to_string()); + self + } +} + impl Snapshot { /// Create a new snapshot builder. pub fn builder() -> SnapshotBuilder { diff --git a/crates/sail-iceberg/src/spec/snapshot_summary.rs b/crates/sail-iceberg/src/spec/snapshots/summary.rs similarity index 85% rename from crates/sail-iceberg/src/spec/snapshot_summary.rs rename to crates/sail-iceberg/src/spec/snapshots/summary.rs index 57e877038b..86b0960ed5 100644 --- a/crates/sail-iceberg/src/spec/snapshot_summary.rs +++ b/crates/sail-iceberg/src/spec/snapshots/summary.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/snapshot_summary.rs - // Awareness stub for read-path. Extend later if needed. 
#[allow(dead_code)] #[derive(Debug, Clone)] diff --git a/crates/sail-iceberg/src/spec/transform.rs b/crates/sail-iceberg/src/spec/transform.rs index a3ed29ecb2..b6ffad9a51 100644 --- a/crates/sail-iceberg/src/spec/transform.rs +++ b/crates/sail-iceberg/src/spec/transform.rs @@ -22,8 +22,8 @@ use std::str::FromStr; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use super::datatypes::{PrimitiveType, Type}; -use super::values::Literal; +use super::types::values::Literal; +use super::types::{PrimitiveType, Type}; /// Transform is used to transform predicates to partition predicates, /// in addition to transforming data values. diff --git a/crates/sail-iceberg/src/spec/datatypes.rs b/crates/sail-iceberg/src/spec/types/mod.rs similarity index 98% rename from crates/sail-iceberg/src/spec/datatypes.rs rename to crates/sail-iceberg/src/spec/types/mod.rs index 88457500f8..4be7e1b9b7 100644 --- a/crates/sail-iceberg/src/spec/datatypes.rs +++ b/crates/sail-iceberg/src/spec/types/mod.rs @@ -17,6 +17,8 @@ // [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/datatypes.rs +pub mod values; + use std::collections::HashMap; use std::fmt; use std::ops::Index; @@ -25,9 +27,7 @@ use std::sync::{Arc, OnceLock}; use serde::de::{Error, IntoDeserializer, MapAccess, Visitor}; use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; use serde_json::Value as JsonValue; - -use super::values::Literal; -use crate::spec::PrimitiveLiteral; +pub use values::*; /// Field name for list type. pub const LIST_FIELD_NAME: &str = "element"; @@ -42,7 +42,7 @@ pub(crate) const MAX_DECIMAL_PRECISION: u32 = 38; mod _decimal { use once_cell::sync::Lazy; - use crate::spec::{MAX_DECIMAL_BYTES, MAX_DECIMAL_PRECISION}; + use super::{MAX_DECIMAL_BYTES, MAX_DECIMAL_PRECISION}; // Max precision of bytes, starts from 1 pub(super) static MAX_PRECISION: Lazy<[u32; MAX_DECIMAL_BYTES as usize]> = Lazy::new(|| { @@ -720,10 +720,8 @@ pub(super) mod _serde { use serde::{Deserialize, Serialize}; - use crate::spec::datatypes::Type::Map; - use crate::spec::datatypes::{ - ListType, MapType, NestedField, NestedFieldRef, PrimitiveType, StructType, Type, - }; + use super::Type::Map; + use super::{ListType, MapType, NestedField, NestedFieldRef, PrimitiveType, StructType, Type}; /// List type for serialization and deserialization #[derive(Serialize, Deserialize)] @@ -787,7 +785,7 @@ pub(super) mod _serde { SerdeType::Struct { r#type: _, fields } => { Self::Struct(StructType::new(fields.into_owned())) } - SerdeType::Primitive(p) => Self::Primitive(p), + SerdeType::Primitive(p) => Self::Primitive(p.clone()), } } } diff --git a/crates/sail-iceberg/src/spec/values.rs b/crates/sail-iceberg/src/spec/types/values.rs similarity index 90% rename from crates/sail-iceberg/src/spec/values.rs rename to crates/sail-iceberg/src/spec/types/values.rs index 9e595db4ba..3386abf1f1 100644 --- a/crates/sail-iceberg/src/spec/values.rs +++ b/crates/sail-iceberg/src/spec/types/values.rs @@ -51,13 +51,13 @@ pub enum PrimitiveLiteral { /// Typed single-value used for lower/upper bounds pub struct Datum { /// Primitive data type of the datum - pub r#type: crate::spec::PrimitiveType, + pub r#type: crate::spec::types::PrimitiveType, /// Primitive literal value pub literal: PrimitiveLiteral, } impl Datum { - pub fn new(r#type: crate::spec::PrimitiveType, literal: PrimitiveLiteral) -> Self { + pub fn new(r#type: crate::spec::types::PrimitiveType, literal: PrimitiveLiteral) -> Self { 
Self { r#type, literal } } } @@ -66,7 +66,7 @@ impl Literal { // TODO: Type-aware JSON conversion pub fn try_from_json( value: JsonValue, - _data_type: &crate::spec::Type, + _data_type: &crate::spec::types::Type, ) -> Result, String> { match value { JsonValue::Null => Ok(None), @@ -77,7 +77,10 @@ impl Literal { } // TODO: Type-aware JSON conversion - pub fn try_into_json(&self, _data_type: &crate::spec::Type) -> Result { + pub fn try_into_json( + &self, + _data_type: &crate::spec::types::Type, + ) -> Result { match self { Literal::Primitive(p) => match p { PrimitiveLiteral::Boolean(v) => Ok(JsonValue::Bool(*v)), diff --git a/crates/sail-iceberg/src/spec/views/mod.rs b/crates/sail-iceberg/src/spec/views/mod.rs new file mode 100644 index 0000000000..d58a3264f1 --- /dev/null +++ b/crates/sail-iceberg/src/spec/views/mod.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod view_metadata; +pub mod view_metadata_builder; +pub mod view_version; + +pub use view_metadata::*; +pub use view_metadata_builder::*; +pub use view_version::*; diff --git a/crates/sail-iceberg/src/spec/view_metadata.rs b/crates/sail-iceberg/src/spec/views/view_metadata.rs similarity index 85% rename from crates/sail-iceberg/src/spec/view_metadata.rs rename to crates/sail-iceberg/src/spec/views/view_metadata.rs index 989a59ff58..c14058fc72 100644 --- a/crates/sail-iceberg/src/spec/view_metadata.rs +++ b/crates/sail-iceberg/src/spec/views/view_metadata.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/view_metadata.rs - +// Awareness stub #[allow(dead_code)] #[derive(Debug, Clone)] pub struct ViewMetadata; diff --git a/crates/sail-iceberg/src/spec/view_metadata_builder.rs b/crates/sail-iceberg/src/spec/views/view_metadata_builder.rs similarity index 84% rename from crates/sail-iceberg/src/spec/view_metadata_builder.rs rename to crates/sail-iceberg/src/spec/views/view_metadata_builder.rs index e8554c2a55..249350e367 100644 --- a/crates/sail-iceberg/src/spec/view_metadata_builder.rs +++ b/crates/sail-iceberg/src/spec/views/view_metadata_builder.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-// [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/view_metadata_builder.rs - +// Awareness stub #[allow(dead_code)] #[derive(Debug, Clone)] pub struct ViewMetadataBuilder; diff --git a/crates/sail-iceberg/src/spec/views/view_version.rs b/crates/sail-iceberg/src/spec/views/view_version.rs new file mode 100644 index 0000000000..17826713bc --- /dev/null +++ b/crates/sail-iceberg/src/spec/views/view_version.rs @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::spec::catalog::NamespaceIdent; +use crate::spec::SchemaId; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct ViewVersion { + #[serde(rename = "version-id")] + pub version_id: i32, + #[serde(rename = "timestamp-ms")] + pub timestamp_ms: i64, + #[serde(rename = "schema-id")] + pub schema_id: SchemaId, + #[serde(default, rename = "default-catalog")] + pub default_catalog: Option, + #[serde(rename = "default-namespace")] + pub default_namespace: NamespaceIdent, + #[serde(default)] + pub summary: HashMap, + pub representations: ViewRepresentations, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ViewRepresentations(pub Vec); + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "kebab-case")] +pub enum ViewRepresentation { + #[serde(rename_all = "kebab-case")] + Sql { + sql: String, + #[serde(default)] + dialect: String, + }, +} From 68d5d2212aef292a049c5f989f26bf7968ea0859 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Mon, 13 Oct 2025 14:26:07 +0800 Subject: [PATCH 13/32] update --- crates/sail-iceberg/src/spec/catalog/mod.rs | 2 +- crates/sail-iceberg/src/spec/mod.rs | 4 ---- crates/sail-iceberg/src/spec/partition/mod.rs | 22 +++++++++++++++++++ .../spec/{partition.rs => partition/spec.rs} | 6 ++--- .../unbound.rs} | 0 .../src/spec/{schema.rs => schema/mod.rs} | 6 ++++- .../spec/{schema_utils.rs => schema/utils.rs} | 3 ++- 7 files changed, 33 insertions(+), 10 deletions(-) create mode 100644 crates/sail-iceberg/src/spec/partition/mod.rs rename crates/sail-iceberg/src/spec/{partition.rs => partition/spec.rs} (98%) rename crates/sail-iceberg/src/spec/{partition_unbound.rs => partition/unbound.rs} (100%) rename crates/sail-iceberg/src/spec/{schema.rs => schema/mod.rs} (99%) rename crates/sail-iceberg/src/spec/{schema_utils.rs => schema/utils.rs} (97%) diff --git a/crates/sail-iceberg/src/spec/catalog/mod.rs b/crates/sail-iceberg/src/spec/catalog/mod.rs index d643d670be..4ed243c179 100644 --- a/crates/sail-iceberg/src/spec/catalog/mod.rs +++ 
b/crates/sail-iceberg/src/spec/catalog/mod.rs @@ -23,7 +23,7 @@ use std::fmt::Display; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::spec::partition_unbound::UnboundPartitionSpec; +use crate::spec::partition::UnboundPartitionSpec; use crate::spec::views::ViewVersion; use crate::spec::{ FormatVersion, PartitionStatisticsFile, Schema, SchemaId, Snapshot, SnapshotReference, diff --git a/crates/sail-iceberg/src/spec/mod.rs b/crates/sail-iceberg/src/spec/mod.rs index 941dee9794..4d95e97f84 100644 --- a/crates/sail-iceberg/src/spec/mod.rs +++ b/crates/sail-iceberg/src/spec/mod.rs @@ -24,7 +24,6 @@ pub mod manifest_list; pub mod metadata; pub mod name_mapping; pub mod partition; -pub mod partition_unbound; pub mod schema; pub mod snapshots; pub mod sort; @@ -39,12 +38,9 @@ pub use manifest_list::*; pub use metadata::*; pub use name_mapping::*; pub use partition::*; -pub use partition_unbound::*; pub use schema::*; pub use snapshots::*; pub use sort::*; pub use transform::*; pub use types::*; pub use views::*; - -pub mod schema_utils; diff --git a/crates/sail-iceberg/src/spec/partition/mod.rs b/crates/sail-iceberg/src/spec/partition/mod.rs new file mode 100644 index 0000000000..457e09ea3c --- /dev/null +++ b/crates/sail-iceberg/src/spec/partition/mod.rs @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +pub mod spec; +pub mod unbound; + +pub use spec::*; +pub use unbound::*; diff --git a/crates/sail-iceberg/src/spec/partition.rs b/crates/sail-iceberg/src/spec/partition/spec.rs similarity index 98% rename from crates/sail-iceberg/src/spec/partition.rs rename to crates/sail-iceberg/src/spec/partition/spec.rs index 4c36721c92..d55f373271 100644 --- a/crates/sail-iceberg/src/spec/partition.rs +++ b/crates/sail-iceberg/src/spec/partition/spec.rs @@ -21,9 +21,9 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; -use super::schema::Schema; -use super::transform::Transform; -use super::types::{NestedField, StructType}; +use crate::spec::schema::Schema; +use crate::spec::transform::Transform; +use crate::spec::types::{NestedField, StructType}; #[allow(unused)] pub(crate) const UNPARTITIONED_LAST_ASSIGNED_ID: i32 = 999; diff --git a/crates/sail-iceberg/src/spec/partition_unbound.rs b/crates/sail-iceberg/src/spec/partition/unbound.rs similarity index 100% rename from crates/sail-iceberg/src/spec/partition_unbound.rs rename to crates/sail-iceberg/src/spec/partition/unbound.rs diff --git a/crates/sail-iceberg/src/spec/schema.rs b/crates/sail-iceberg/src/spec/schema/mod.rs similarity index 99% rename from crates/sail-iceberg/src/spec/schema.rs rename to crates/sail-iceberg/src/spec/schema/mod.rs index aa6422ce5d..6557a6c67c 100644 --- a/crates/sail-iceberg/src/spec/schema.rs +++ b/crates/sail-iceberg/src/spec/schema/mod.rs @@ -23,7 +23,11 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; -use super::types::{NestedFieldRef, PrimitiveType, StructType, Type}; +use crate::spec::types::{NestedFieldRef, PrimitiveType, StructType, Type}; + +pub mod utils; + +pub use utils::*; /// Type alias for schema id. pub type SchemaId = i32; diff --git a/crates/sail-iceberg/src/spec/schema_utils.rs b/crates/sail-iceberg/src/spec/schema/utils.rs similarity index 97% rename from crates/sail-iceberg/src/spec/schema_utils.rs rename to crates/sail-iceberg/src/spec/schema/utils.rs index 79523136e4..dd6421577e 100644 --- a/crates/sail-iceberg/src/spec/schema_utils.rs +++ b/crates/sail-iceberg/src/spec/schema/utils.rs @@ -19,7 +19,8 @@ use std::collections::{HashMap, HashSet, VecDeque}; -use crate::spec::{NestedFieldRef, Schema, Type}; +use super::Schema; +use crate::spec::types::{NestedFieldRef, Type}; /// Visit all fields in a schema in breadth-first order, calling the callback for each field id. 
pub fn visit_fields_bfs(schema: &Schema, mut f: F) { From c4797a375f4fe23b4472b0ddca6d3d5fd0370aa0 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Mon, 13 Oct 2025 15:05:28 +0800 Subject: [PATCH 14/32] pruning --- .../src/datasource/expressions.rs | 39 ++++ crates/sail-iceberg/src/datasource/mod.rs | 2 + .../sail-iceberg/src/datasource/provider.rs | 46 +++- crates/sail-iceberg/src/datasource/pruning.rs | 204 ++++++++++++++++++ 4 files changed, 288 insertions(+), 3 deletions(-) create mode 100644 crates/sail-iceberg/src/datasource/expressions.rs create mode 100644 crates/sail-iceberg/src/datasource/pruning.rs diff --git a/crates/sail-iceberg/src/datasource/expressions.rs b/crates/sail-iceberg/src/datasource/expressions.rs new file mode 100644 index 0000000000..3e79d835e2 --- /dev/null +++ b/crates/sail-iceberg/src/datasource/expressions.rs @@ -0,0 +1,39 @@ +use std::sync::Arc; + +use datafusion::catalog::Session; +use datafusion::common::DFSchema; +use datafusion::logical_expr::execution_props::ExecutionProps; +use datafusion::logical_expr::simplify::SimplifyContext; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; +use datafusion::optimizer::simplify_expressions::ExprSimplifier; +use datafusion::physical_expr::PhysicalExpr; + +pub fn simplify_expr( + session: &dyn Session, + df_schema: &DFSchema, + expr: Expr, +) -> Arc { + let props = ExecutionProps::new(); + let simplify_context = SimplifyContext::new(&props).with_schema(df_schema.clone().into()); + let simplifier = ExprSimplifier::new(simplify_context).with_max_cycles(10); + #[allow(clippy::expect_used)] + let simplified = simplifier + .simplify(expr) + .expect("Failed to simplify expression"); + #[allow(clippy::expect_used)] + session + .create_physical_expr(simplified, df_schema) + .expect("Failed to create physical expression") +} + +pub fn get_pushdown_filters( + filter: &[&Expr], + _partition_cols: &[String], +) -> Vec { + // Conservatively mark filters as Inexact for now; refine with partition-aware analysis later. 
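+    // Note: Inexact is always safe here because DataFusion re-applies any pushed-down
+    // filter that is not marked Exact, so the worst case is redundant filter evaluation,
+    // never incorrect results.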
+ // TODO: Partition-aware + filter + .iter() + .map(|_| TableProviderFilterPushDown::Inexact) + .collect() +} diff --git a/crates/sail-iceberg/src/datasource/mod.rs b/crates/sail-iceberg/src/datasource/mod.rs index 588b4a12ef..629fbff659 100644 --- a/crates/sail-iceberg/src/datasource/mod.rs +++ b/crates/sail-iceberg/src/datasource/mod.rs @@ -1,3 +1,5 @@ +pub mod expressions; pub mod provider; +pub mod pruning; pub use provider::*; diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 86c135b944..483858e534 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -9,19 +9,23 @@ use datafusion::catalog::memory::DataSourceExec; use datafusion::catalog::Session; use datafusion::common::scalar::ScalarValue; use datafusion::common::stats::{ColumnStatistics, Precision, Statistics}; -use datafusion::common::Result as DataFusionResult; +use datafusion::common::{Result as DataFusionResult, ToDFSchema}; use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder, ParquetSource}; use datafusion::datasource::{TableProvider, TableType}; use datafusion::execution::object_store::ObjectStoreUrl; +use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::{Expr, LogicalPlan}; +use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ExecutionPlan; use object_store::path::Path as ObjectPath; use object_store::ObjectMeta; use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; +use crate::datasource::expressions::simplify_expr; +use crate::datasource::pruning::prune_files; use crate::spec::types::values::{Literal, PrimitiveLiteral}; use crate::spec::{ DataFile, FormatVersion, Manifest, ManifestContentType, ManifestList, ManifestStatus, Schema, @@ -419,9 +423,32 @@ impl TableProvider for IcebergTableProvider { ); log::info!("[ICEBERG] Loading data files from manifests..."); - let data_files = self.load_data_files(&object_store, &manifest_list).await?; + let mut data_files = self.load_data_files(&object_store, &manifest_list).await?; log::info!("[ICEBERG] Loaded {} data files", data_files.len()); + // TODO: Manifest-level pruning using partition summaries to avoid loading all files + // TODO: Partition-transform aware filtering before file-level metrics pruning + + // Build filter conjunction and run DataFusion-based pruning on Iceberg metrics + let filter_expr = conjunction(_filters.iter().cloned()); + let mut _pruning_mask: Option> = None; + if filter_expr.is_some() || limit.is_some() { + let (kept, mask) = prune_files( + session, + _filters, + limit, + self.arrow_schema.clone(), + data_files, + &self.schema, + )?; + _pruning_mask = mask; + data_files = kept; + log::info!( + "[ICEBERG] Pruned data files, remaining: {}", + data_files.len() + ); + } + log::info!("[ICEBERG] Creating partitioned files..."); let partitioned_files = self.create_partitioned_files(data_files)?; log::info!( @@ -448,7 +475,20 @@ impl TableProvider for IcebergTableProvider { ..Default::default() }; - let parquet_source = Arc::new(ParquetSource::new(parquet_options)); + let mut parquet_source = ParquetSource::new(parquet_options); + // Prepare pushdown filter for Parquet + let pushdown_filter: Option> = if !_filters.is_empty() { + let df_schema = self.arrow_schema.clone().to_dfschema()?; + let pushdown_expr = conjunction(_filters.iter().cloned()); + 
pushdown_expr.map(|expr| simplify_expr(session, &df_schema, expr)) + } else { + None + }; + if let Some(pred) = pushdown_filter { + // TODO: Consider expression adapter for Parquet pushdown + parquet_source = parquet_source.with_predicate(pred); + } + let parquet_source = Arc::new(parquet_source); let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_schema, parquet_source) diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs new file mode 100644 index 0000000000..2588e1be14 --- /dev/null +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -0,0 +1,204 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::array::{ArrayRef, BooleanArray, UInt64Array}; +use datafusion::arrow::datatypes::Schema as ArrowSchema; +use datafusion::catalog::Session; +use datafusion::common::pruning::PruningStatistics; +use datafusion::common::{Column, Result, ToDFSchema}; +use datafusion::logical_expr::utils::conjunction; +use datafusion::logical_expr::Expr; +use datafusion::physical_optimizer::pruning::PruningPredicate; + +use crate::spec::types::values::{Datum, Literal}; +use crate::spec::{DataFile, Schema}; + +fn literal_to_scalar_value_local(literal: &Literal) -> datafusion::common::scalar::ScalarValue { + match literal { + Literal::Primitive(p) => match p { + crate::spec::types::values::PrimitiveLiteral::Boolean(v) => { + datafusion::common::scalar::ScalarValue::Boolean(Some(*v)) + } + crate::spec::types::values::PrimitiveLiteral::Int(v) => { + datafusion::common::scalar::ScalarValue::Int32(Some(*v)) + } + crate::spec::types::values::PrimitiveLiteral::Long(v) => { + datafusion::common::scalar::ScalarValue::Int64(Some(*v)) + } + crate::spec::types::values::PrimitiveLiteral::Float(v) => { + datafusion::common::scalar::ScalarValue::Float32(Some(v.into_inner())) + } + crate::spec::types::values::PrimitiveLiteral::Double(v) => { + datafusion::common::scalar::ScalarValue::Float64(Some(v.into_inner())) + } + crate::spec::types::values::PrimitiveLiteral::String(v) => { + datafusion::common::scalar::ScalarValue::Utf8(Some(v.clone())) + } + crate::spec::types::values::PrimitiveLiteral::Binary(v) => { + datafusion::common::scalar::ScalarValue::Binary(Some(v.clone())) + } + crate::spec::types::values::PrimitiveLiteral::Int128(v) => { + datafusion::common::scalar::ScalarValue::Decimal128(Some(*v), 38, 0) + } + crate::spec::types::values::PrimitiveLiteral::UInt128(v) => { + if *v <= i128::MAX as u128 { + datafusion::common::scalar::ScalarValue::Decimal128(Some(*v as i128), 38, 0) + } else { + datafusion::common::scalar::ScalarValue::Utf8(Some(v.to_string())) + } + } + }, + Literal::Struct(fields) => { + let json_repr = serde_json::to_string(fields).unwrap_or_default(); + datafusion::common::scalar::ScalarValue::Utf8(Some(json_repr)) + } + Literal::List(items) => { + let json_repr = serde_json::to_string(items).unwrap_or_default(); + datafusion::common::scalar::ScalarValue::Utf8(Some(json_repr)) + } + Literal::Map(pairs) => { + let json_repr = serde_json::to_string(pairs).unwrap_or_default(); + datafusion::common::scalar::ScalarValue::Utf8(Some(json_repr)) + } + } +} + +/// Pruning statistics over Iceberg DataFiles +pub struct IcebergPruningStats { + files: Vec, + #[allow(unused)] + arrow_schema: Arc, + /// Arrow field name -> Iceberg field id + name_to_field_id: HashMap, +} + +impl IcebergPruningStats { + pub fn new( + files: Vec, + arrow_schema: Arc, + iceberg_schema: &Schema, + ) -> Self { + let mut name_to_field_id = 
HashMap::new();
+        for f in iceberg_schema.fields().iter() {
+            name_to_field_id.insert(f.name.clone(), f.id);
+        }
+        Self {
+            files,
+            arrow_schema,
+            name_to_field_id,
+        }
+    }
+
+    fn field_id_for(&self, column: &Column) -> Option<i32> {
+        self.name_to_field_id.get(&column.name).copied()
+    }
+
+    fn datum_to_scalar(&self, datum: &Datum) -> datafusion::common::scalar::ScalarValue {
+        // Reuse existing literal conversion via Datum.literal
+        literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone()))
+    }
+}
+
+impl PruningStatistics for IcebergPruningStats {
+    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
+        let field_id = self.field_id_for(column)?;
+        let scalars = self.files.iter().map(|f| {
+            f.lower_bounds()
+                .get(&field_id)
+                .map(|d| self.datum_to_scalar(d))
+        });
+        // Build an Arrow array from Option<ScalarValue>
+        let values =
+            scalars.map(|opt| opt.unwrap_or(datafusion::common::scalar::ScalarValue::Null));
+        datafusion::common::scalar::ScalarValue::iter_to_array(values).ok()
+    }
+
+    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
+        let field_id = self.field_id_for(column)?;
+        let scalars = self.files.iter().map(|f| {
+            f.upper_bounds()
+                .get(&field_id)
+                .map(|d| self.datum_to_scalar(d))
+        });
+        let values =
+            scalars.map(|opt| opt.unwrap_or(datafusion::common::scalar::ScalarValue::Null));
+        datafusion::common::scalar::ScalarValue::iter_to_array(values).ok()
+    }
+
+    fn num_containers(&self) -> usize {
+        self.files.len()
+    }
+
+    fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
+        let field_id = self.field_id_for(column)?;
+        let counts: Vec<u64> = self
+            .files
+            .iter()
+            .map(|f| f.null_value_counts().get(&field_id).copied().unwrap_or(0))
+            .collect();
+        Some(Arc::new(UInt64Array::from(counts)) as ArrayRef)
+    }
+
+    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
+        let rows: Vec<u64> = self.files.iter().map(|f| f.record_count()).collect();
+        Some(Arc::new(UInt64Array::from(rows)) as ArrayRef)
+    }
+
+    fn contained(
+        &self,
+        _column: &Column,
+        _value: &std::collections::HashSet<datafusion::common::scalar::ScalarValue>,
+    ) -> Option<BooleanArray> {
+        // TODO: Partition-aware contained pruning
+        None
+    }
+}
+
+/// Prune Iceberg data files using DataFusion PruningPredicate over IcebergPruningStats
+pub fn prune_files(
+    session: &dyn Session,
+    filters: &[Expr],
+    limit: Option<usize>,
+    logical_schema: Arc<ArrowSchema>,
+    files: Vec<DataFile>,
+    iceberg_schema: &Schema,
+) -> Result<(Vec<DataFile>, Option<Vec<bool>>)> {
+    let filter_expr = conjunction(filters.iter().cloned());
+
+    if filter_expr.is_none() && limit.is_none() {
+        return Ok((files, None));
+    }
+
+    let stats = IcebergPruningStats::new(files, logical_schema.clone(), iceberg_schema);
+
+    let files_to_keep = if let Some(predicate) = &filter_expr {
+        let df_schema = logical_schema.clone().to_dfschema()?;
+        let physical_predicate = session.create_physical_expr(predicate.clone(), &df_schema)?;
+        let pruning_predicate = PruningPredicate::try_new(physical_predicate, logical_schema)?;
+        pruning_predicate.prune(&stats)?
+    } else {
+        vec![true; stats.num_containers()]
+    };
+
+    let mut kept = Vec::new();
+    let mut rows_collected: u64 = 0;
+    for (file, keep) in stats.files.into_iter().zip(files_to_keep.iter()) {
+        if *keep {
+            if let Some(lim) = limit {
+                if rows_collected <= lim as u64 {
+                    rows_collected += file.record_count();
+                    kept.push(file);
+                    if rows_collected > lim as u64 {
+                        break;
+                    }
+                } else {
+                    break;
+                }
+            } else {
+                kept.push(file);
+            }
+        }
+    }
+
+    Ok((kept, Some(files_to_keep)))
+}

From d6450f109024fd8e738a218269b1d8b9253f9227 Mon Sep 17 00:00:00 2001
From: xiaolong
Date: Mon, 13 Oct 2025 15:11:01 +0800
Subject: [PATCH 15/32] improve stat

---
 .../sail-iceberg/src/datasource/provider.rs   | 95 ++++++++++++++++++-
 1 file changed, 93 insertions(+), 2 deletions(-)

diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs
index 483858e534..a39497ed8d 100644
--- a/crates/sail-iceberg/src/datasource/provider.rs
+++ b/crates/sail-iceberg/src/datasource/provider.rs
@@ -277,6 +277,94 @@ impl IcebergTableProvider {
         file_groups.into_values().map(FileGroup::from).collect()
     }
 
+    /// Aggregate table-level statistics from a list of Iceberg data files
+    fn aggregate_statistics(&self, data_files: &[DataFile]) -> Statistics {
+        if data_files.is_empty() {
+            return Statistics::new_unknown(&self.arrow_schema);
+        }
+
+        let mut total_rows: usize = 0;
+        let mut total_bytes: usize = 0;
+
+        // Pre-compute field id per column index
+        let field_ids: Vec<i32> = self
+            .schema
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(_, f)| f.id)
+            .collect();
+
+        // Initialize accumulators per column
+        let mut min_scalars: Vec<Option<ScalarValue>> =
+            vec![None; self.arrow_schema.fields().len()];
+        let mut max_scalars: Vec<Option<ScalarValue>> =
+            vec![None; self.arrow_schema.fields().len()];
+        let mut null_counts: Vec<usize> = vec![0; self.arrow_schema.fields().len()];
+
+        for df in data_files {
+            total_rows = total_rows.saturating_add(df.record_count() as usize);
+            total_bytes = total_bytes.saturating_add(df.file_size_in_bytes() as usize);
+
+            for (col_idx, field_id) in field_ids.iter().enumerate() {
+                // null counts
+                if let Some(c) = df.null_value_counts().get(field_id) {
+                    null_counts[col_idx] = null_counts[col_idx].saturating_add(*c as usize);
+                }
+
+                // min
+                if let Some(d) = df.lower_bounds().get(field_id) {
+                    let v = Literal::Primitive(d.literal.clone());
+                    let sv = self.literal_to_scalar_value(&v);
+                    min_scalars[col_idx] = match (&min_scalars[col_idx], &sv) {
+                        (None, s) => Some(s.clone()),
+                        (Some(existing), s) => Some(if s < existing {
+                            s.clone()
+                        } else {
+                            existing.clone()
+                        }),
+                    };
+                }
+
+                // max
+                if let Some(d) = df.upper_bounds().get(field_id) {
+                    let v = Literal::Primitive(d.literal.clone());
+                    let sv = self.literal_to_scalar_value(&v);
+                    max_scalars[col_idx] = match (&max_scalars[col_idx], &sv) {
+                        (None, s) => Some(s.clone()),
+                        (Some(existing), s) => Some(if s > existing {
+                            s.clone()
+                        } else {
+                            existing.clone()
+                        }),
+                    };
+                }
+            }
+        }
+
+        let column_statistics = (0..self.arrow_schema.fields().len())
+            .map(|i| ColumnStatistics {
+                null_count: Precision::Exact(null_counts[i]),
+                max_value: max_scalars[i]
+                    .clone()
+                    .map(Precision::Exact)
+                    .unwrap_or(Precision::Absent),
+                min_value: min_scalars[i]
+                    .clone()
+                    .map(Precision::Exact)
+                    .unwrap_or(Precision::Absent),
+                distinct_count: Precision::Absent,
+                sum_value: Precision::Absent,
+            })
+            .collect();
+
+        Statistics {
+            num_rows: Precision::Exact(total_rows),
+            total_byte_size: Precision::Exact(total_bytes),
+            column_statistics,
+        }
+    }
+
     /// Convert Iceberg
Literal to DataFusion ScalarValue fn literal_to_scalar_value(&self, literal: &Literal) -> ScalarValue { match literal { @@ -450,7 +538,7 @@ impl TableProvider for IcebergTableProvider { } log::info!("[ICEBERG] Creating partitioned files..."); - let partitioned_files = self.create_partitioned_files(data_files)?; + let partitioned_files = self.create_partitioned_files(data_files.clone())?; log::info!( "[ICEBERG] Created {} partitioned files", partitioned_files.len() @@ -490,6 +578,9 @@ impl TableProvider for IcebergTableProvider { } let parquet_source = Arc::new(parquet_source); + // Build table statistics from pruned files + let table_stats = self.aggregate_statistics(&data_files); + let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_schema, parquet_source) .with_file_groups(if file_groups.is_empty() { @@ -497,7 +588,7 @@ impl TableProvider for IcebergTableProvider { } else { file_groups }) - .with_statistics(Statistics::new_unknown(&self.arrow_schema)) + .with_statistics(table_stats) .with_projection(projection.cloned()) .with_limit(limit) .build(); From 8b0005bc5e487d4a8acf2e057fa3a3307bc4c935 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Mon, 13 Oct 2025 15:20:33 +0800 Subject: [PATCH 16/32] manifest level --- .../sail-iceberg/src/datasource/provider.rs | 49 ++++++++++++++----- crates/sail-iceberg/src/datasource/pruning.rs | 41 +++++++++++++++- 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index a39497ed8d..0ab400ed37 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -144,6 +144,8 @@ impl IcebergTableProvider { /// Load data files from manifests async fn load_data_files( &self, + session: &dyn Session, + filters: &[Expr], object_store: &Arc, manifest_list: &ManifestList, ) -> DataFusionResult> { @@ -184,17 +186,38 @@ impl IcebergTableProvider { // Get partition_spec_id from manifest file let partition_spec_id = manifest_file.partition_spec_id; - for entry_ref in manifest.entries() { - let entry = entry_ref.as_ref(); - if matches!( - entry.status, - ManifestStatus::Added | ManifestStatus::Existing - ) { - let mut df = entry.data_file.clone(); - // overwrite partition_spec_id from manifest list file - df.partition_spec_id = partition_spec_id; - data_files.push(df); - } + // Collect data files for this manifest + let manifest_data_files: Vec = manifest + .entries() + .iter() + .filter_map(|entry_ref| { + let entry = entry_ref.as_ref(); + if matches!( + entry.status, + ManifestStatus::Added | ManifestStatus::Existing + ) { + let mut df = entry.data_file.clone(); + df.partition_spec_id = partition_spec_id; + Some(df) + } else { + None + } + }) + .collect(); + + // Early prune at manifest entry level using DataFusion predicate over metrics + if !filters.is_empty() { + let (kept, _mask) = crate::datasource::pruning::prune_files( + session, + filters, + None, + self.arrow_schema.clone(), + manifest_data_files, + &self.schema, + )?; + data_files.extend(kept); + } else { + data_files.extend(manifest_data_files); } } @@ -511,7 +534,9 @@ impl TableProvider for IcebergTableProvider { ); log::info!("[ICEBERG] Loading data files from manifests..."); - let mut data_files = self.load_data_files(&object_store, &manifest_list).await?; + let mut data_files = self + .load_data_files(session, _filters, &object_store, &manifest_list) + .await?; log::info!("[ICEBERG] Loaded {} data files", data_files.len()); // 
TODO: Manifest-level pruning using partition summaries to avoid loading all files diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index 2588e1be14..b6cb41eae5 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -11,9 +11,11 @@ use datafusion::logical_expr::Expr; use datafusion::physical_optimizer::pruning::PruningPredicate; use crate::spec::types::values::{Datum, Literal}; -use crate::spec::{DataFile, Schema}; +use crate::spec::{DataFile, Manifest, ManifestContentType, ManifestList, Schema}; -fn literal_to_scalar_value_local(literal: &Literal) -> datafusion::common::scalar::ScalarValue { +pub(crate) fn literal_to_scalar_value_local( + literal: &Literal, +) -> datafusion::common::scalar::ScalarValue { match literal { Literal::Primitive(p) => match p { crate::spec::types::values::PrimitiveLiteral::Boolean(v) => { @@ -202,3 +204,38 @@ pub fn prune_files( Ok((kept, Some(files_to_keep))) } + +/// Manifest-level pruning using partition summaries from ManifestList +pub fn prune_manifests_by_partition_summaries<'a>( + manifest_list: &'a ManifestList, + _schema: &Schema, + _filters: &[Expr], +) -> Vec<&'a crate::spec::manifest_list::ManifestFile> { + // TODO: Evaluate filters against `ManifestFile.partitions` FieldSummary to drop manifests early + manifest_list + .entries() + .iter() + .filter(|mf| mf.content == ManifestContentType::Data) + .collect() +} + +/// Load a manifest and prune entries by partition+metrics +pub fn prune_manifest_entries( + manifest: &Manifest, + _schema: &Schema, + _filters: &[Expr], +) -> Vec { + // TODO: Partition-transform awareness and metrics-only prune at manifest entry granularity + manifest + .entries() + .iter() + .filter(|e| { + matches!( + e.status, + crate::spec::manifest::ManifestStatus::Added + | crate::spec::manifest::ManifestStatus::Existing + ) + }) + .map(|e| e.data_file.clone()) + .collect() +} From 5dbf1135a9bf0b203dba1543b4301557c9b09293 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Mon, 13 Oct 2025 16:14:40 +0800 Subject: [PATCH 17/32] update --- .../sail-iceberg/src/datasource/provider.rs | 31 +- crates/sail-iceberg/src/datasource/pruning.rs | 476 +++++++++++++++++- crates/sail-iceberg/src/table_format.rs | 20 +- 3 files changed, 507 insertions(+), 20 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 0ab400ed37..4ba2718a94 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -25,11 +25,11 @@ use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; use crate::datasource::expressions::simplify_expr; -use crate::datasource::pruning::prune_files; +use crate::datasource::pruning::{prune_files, prune_manifests_by_partition_summaries}; use crate::spec::types::values::{Literal, PrimitiveLiteral}; use crate::spec::{ - DataFile, FormatVersion, Manifest, ManifestContentType, ManifestList, ManifestStatus, Schema, - Snapshot, + DataFile, FormatVersion, Manifest, ManifestContentType, ManifestList, ManifestStatus, + PartitionSpec, Schema, Snapshot, }; /// Iceberg table provider for DataFusion @@ -41,6 +41,9 @@ pub struct IcebergTableProvider { schema: Schema, /// The current snapshot of the table snapshot: Snapshot, + /// All partition specs referenced by the table + #[allow(unused)] + partition_specs: Vec, /// Arrow schema for DataFusion arrow_schema: Arc, } @@ -51,6 +54,7 @@ impl 
IcebergTableProvider { table_uri: impl ToString, schema: Schema, snapshot: Snapshot, + partition_specs: Vec, ) -> DataFusionResult { let table_uri_str = table_uri.to_string(); log::info!("[ICEBERG] Creating table provider for: {}", table_uri_str); @@ -69,6 +73,7 @@ impl IcebergTableProvider { table_uri: table_uri_str, schema, snapshot, + partition_specs, arrow_schema, }) } @@ -151,7 +156,16 @@ impl IcebergTableProvider { ) -> DataFusionResult> { let mut data_files = Vec::new(); - for manifest_file in manifest_list.entries() { + // Build partition spec map for summary pruning + let spec_map: HashMap = self + .partition_specs + .iter() + .map(|s| (s.spec_id(), s.clone())) + .collect(); + let manifest_files = + prune_manifests_by_partition_summaries(manifest_list, &self.schema, &spec_map, filters); + + for manifest_file in manifest_files { // TODO: Support delete manifests if manifest_file.content != ManifestContentType::Data { continue; @@ -310,13 +324,7 @@ impl IcebergTableProvider { let mut total_bytes: usize = 0; // Pre-compute field id per column index - let field_ids: Vec = self - .schema - .fields() - .iter() - .enumerate() - .map(|(_, f)| f.id) - .collect(); + let field_ids: Vec = self.schema.fields().iter().map(|f| f.id).collect(); // Initialize accumulators per column let mut min_scalars: Vec> = @@ -604,6 +612,7 @@ impl TableProvider for IcebergTableProvider { let parquet_source = Arc::new(parquet_source); // Build table statistics from pruned files + // TODO: Include partition-level stats and handle unknowns conservatively let table_stats = self.aggregate_statistics(&data_files); let file_scan_config = diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index b6cb41eae5..eed016c758 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -7,15 +7,19 @@ use datafusion::catalog::Session; use datafusion::common::pruning::PruningStatistics; use datafusion::common::{Column, Result, ToDFSchema}; use datafusion::logical_expr::utils::conjunction; -use datafusion::logical_expr::Expr; +use datafusion::logical_expr::{BinaryExpr, Expr, Operator}; use datafusion::physical_optimizer::pruning::PruningPredicate; +use crate::spec::partition::PartitionSpec; use crate::spec::types::values::{Datum, Literal}; +use crate::spec::types::{PrimitiveType, Type}; use crate::spec::{DataFile, Manifest, ManifestContentType, ManifestList, Schema}; +// TODO: Consider parsing logical expressions more robustly for summary pruning pub(crate) fn literal_to_scalar_value_local( literal: &Literal, ) -> datafusion::common::scalar::ScalarValue { + // TODO: Extend conversion to cover Decimal/UUID/Fixed with precise semantics and timezones match literal { Literal::Primitive(p) => match p { crate::spec::types::values::PrimitiveLiteral::Boolean(v) => { @@ -97,12 +101,14 @@ impl IcebergPruningStats { fn datum_to_scalar(&self, datum: &Datum) -> datafusion::common::scalar::ScalarValue { // Reuse existing literal conversion via Datum.literal + // TODO: Avoid allocations by caching conversions per field id literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) } } impl PruningStatistics for IcebergPruningStats { fn min_values(&self, column: &Column) -> Option { + // TODO: Materialize arrays only for columns referenced by the predicate let field_id = self.field_id_for(column)?; let scalars = self.files.iter().map(|f| { f.lower_bounds() @@ -208,14 +214,139 @@ pub fn prune_files( /// Manifest-level pruning 
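// --- Illustrative aside, not part of the patch ----------------------------------
// The summary-pruning helper below only understands simple conjunctions of
// `col = literal`, `col IN (...)`, and range comparisons on identity-partitioned
// columns. A minimal sketch of collecting `col = literal` conjuncts from a
// DataFusion expression tree (hypothetical function name; OR branches are ignored
// on purpose so the collected set stays a conservative over-approximation):
use datafusion::common::ScalarValue;
use datafusion::logical_expr::{BinaryExpr, Expr, Operator};

fn collect_eq_conjuncts(expr: &Expr, out: &mut Vec<(String, ScalarValue)>) {
    match expr {
        // Recurse into AND so `year = 2023 AND month = 1` yields both conjuncts.
        Expr::BinaryExpr(BinaryExpr { left, op: Operator::And, right }) => {
            collect_eq_conjuncts(left, out);
            collect_eq_conjuncts(right, out);
        }
        // Keep only `column = literal` leaves.
        Expr::BinaryExpr(BinaryExpr { left, op: Operator::Eq, right }) => {
            if let (Expr::Column(col), Expr::Literal(value, _)) = (left.as_ref(), right.as_ref()) {
                out.push((col.name.clone(), value.clone()));
            }
        }
        _ => {}
    }
}
// `year = 2023 OR month = 1` yields nothing and therefore prunes nothing.
// ---------------------------------------------------------------------------------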
using partition summaries from ManifestList pub fn prune_manifests_by_partition_summaries<'a>( manifest_list: &'a ManifestList, - _schema: &Schema, - _filters: &[Expr], + table_schema: &Schema, + partition_specs: &std::collections::HashMap, + filters: &[Expr], ) -> Vec<&'a crate::spec::manifest_list::ManifestFile> { - // TODO: Evaluate filters against `ManifestFile.partitions` FieldSummary to drop manifests early + // TODO: Support non-identity transforms (day/month/hour/bucket/truncate) + let eq_filters = collect_identity_eq_filters(table_schema, filters); + let in_filters = collect_identity_in_filters(table_schema, filters); + let range_filters = collect_identity_range_filters(table_schema, filters); manifest_list .entries() .iter() .filter(|mf| mf.content == ManifestContentType::Data) + .filter(|mf| { + // If no simple identity eq filters, keep manifest + if eq_filters.is_empty() { + return true; + } + let Some(spec) = partition_specs.get(&mf.partition_spec_id) else { + return true; + }; + let Some(part_summaries) = mf.partitions.as_ref() else { + return true; + }; + // Build partition field result types + let part_type = match spec.partition_type(table_schema) { + Ok(t) => t, + Err(_) => return true, + }; + // Evaluate equality filters; if any contradicts summaries, drop manifest + for (source_id, lit) in &eq_filters { + // find partition field with identity transform sourcing this column + if let Some((idx, _pf)) = spec.fields().iter().enumerate().find(|(_, pf)| { + pf.source_id == *source_id + && matches!(pf.transform, crate::spec::transform::Transform::Identity) + }) { + if let Some(summary) = part_summaries.get(idx) { + // decode bounds according to partition field type + let field_ty = part_type.fields().get(idx).map(|nf| nf.field_type.as_ref()); + if let Some(Type::Primitive(prim_ty)) = field_ty { + // TODO: Handle contains_null/contains_nan from FieldSummary + let lower = summary + .lower_bound_bytes + .as_ref() + .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); + let upper = summary + .upper_bound_bytes + .as_ref() + .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); + + if let (Some(lb), Some(ub)) = (lower.as_ref(), upper.as_ref()) { + // if lit < lb or lit > ub, cannot match + if lt_prim(lit, lb) || gt_prim(lit, ub) { + return false; + } + } else { + // TODO: If only one bound is present, use it for pruning when safe + } + } + } + } + } + + // Evaluate IN-list filters: require intersection with [lb, ub] + for (source_id, lits) in &in_filters { + if let Some((idx, _pf)) = spec.fields().iter().enumerate().find(|(_, pf)| { + pf.source_id == *source_id + && matches!(pf.transform, crate::spec::transform::Transform::Identity) + }) { + if let Some(summary) = part_summaries.get(idx) { + let field_ty = part_type.fields().get(idx).map(|nf| nf.field_type.as_ref()); + if let Some(Type::Primitive(prim_ty)) = field_ty { + let lower = summary + .lower_bound_bytes + .as_ref() + .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); + let upper = summary + .upper_bound_bytes + .as_ref() + .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); + if let (Some(lb), Some(ub)) = (lower.as_ref(), upper.as_ref()) { + let mut any_in = false; + for lit in lits { + if !(lt_prim(lit, lb) || gt_prim(lit, ub)) { + any_in = true; + break; + } + } + if !any_in { + return false; + } + } + } + } + } + } + + // Evaluate simple range filters: require overlap with [lb, ub] + for (source_id, range) in &range_filters { + if let Some((idx, _pf)) = spec.fields().iter().enumerate().find(|(_, pf)| { + pf.source_id == 
*source_id + && matches!(pf.transform, crate::spec::transform::Transform::Identity) + }) { + if let Some(summary) = part_summaries.get(idx) { + let field_ty = part_type.fields().get(idx).map(|nf| nf.field_type.as_ref()); + if let Some(Type::Primitive(prim_ty)) = field_ty { + let lower = summary + .lower_bound_bytes + .as_ref() + .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); + let upper = summary + .upper_bound_bytes + .as_ref() + .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); + if let (Some(lb), Some(ub)) = (lower.as_ref(), upper.as_ref()) { + // manifest range [lb, ub] + // query range [min, max] + if let Some((ref qmin, _incl_min)) = range.min { + if gt_prim(qmin, ub) { + return false; + } + } + if let Some((ref qmax, _incl_max)) = range.max { + if lt_prim(qmax, lb) { + return false; + } + } + } + } + } + } + } + true + }) .collect() } @@ -239,3 +370,340 @@ pub fn prune_manifest_entries( .map(|e| e.data_file.clone()) .collect() } + +fn collect_identity_eq_filters( + schema: &Schema, + filters: &[Expr], +) -> Vec<(i32, crate::spec::types::values::PrimitiveLiteral)> { + // returns Vec of (source_id, literal) for Exprs of form col = literal + fn strip(expr: &Expr) -> &Expr { + match expr { + Expr::Cast(c) => strip(&c.expr), + Expr::Alias(a) => strip(&a.expr), + _ => expr, + } + } + + let mut result = Vec::new(); + + fn visit_expr( + acc: &mut Vec<(i32, crate::spec::types::values::PrimitiveLiteral)>, + schema: &Schema, + e: &Expr, + ) { + match e { + Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::Eq => { + let l = strip(left); + let r = strip(right); + + // col = lit + if let Expr::Column(c) = l { + if let Expr::Literal(sv, _) = r { + let col_name = c.name.clone(); + if let Some(field) = schema.field_by_name(&col_name) { + if let Some(pl) = scalar_to_primitive_literal(sv) { + acc.push((field.id, pl)); + return; + } + } + } + } + + // lit = col + if let Expr::Literal(sv, _) = l { + if let Expr::Column(c) = r { + let col_name = c.name.clone(); + if let Some(field) = schema.field_by_name(&col_name) { + if let Some(pl) = scalar_to_primitive_literal(sv) { + acc.push((field.id, pl)); + } + } + } + } + } + Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::And => { + visit_expr(acc, schema, left); + visit_expr(acc, schema, right); + } + _ => {} + } + } + + for expr in filters { + visit_expr(&mut result, schema, expr); + } + result +} + +fn collect_identity_in_filters( + schema: &Schema, + filters: &[Expr], +) -> std::collections::HashMap> { + fn strip(expr: &Expr) -> &Expr { + match expr { + Expr::Cast(c) => strip(&c.expr), + Expr::Alias(a) => strip(&a.expr), + _ => expr, + } + } + + let mut result: std::collections::HashMap<_, Vec<_>> = std::collections::HashMap::new(); + + fn visit_expr( + acc: &mut std::collections::HashMap>, + schema: &Schema, + e: &Expr, + ) { + match e { + Expr::InList(in_list) if !in_list.negated => { + let e = strip(&in_list.expr); + if let Expr::Column(c) = e { + if let Some(field) = schema.field_by_name(&c.name) { + let mut vals = Vec::new(); + for item in &in_list.list { + if let Expr::Literal(ref sv, _) = item { + if let Some(pl) = scalar_to_primitive_literal(sv) { + vals.push(pl); + } + } + } + if !vals.is_empty() { + acc.entry(field.id).or_default().extend(vals); + } + } + } + } + Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::And => { + visit_expr(acc, schema, left); + visit_expr(acc, schema, right); + } + _ => {} + } + } + + for expr in filters { + visit_expr(&mut result, schema, expr); + } + 
result +} + +#[derive(Clone, Default)] +struct RangeConstraint { + min: Option<(crate::spec::types::values::PrimitiveLiteral, bool)>, + max: Option<(crate::spec::types::values::PrimitiveLiteral, bool)>, +} + +fn collect_identity_range_filters( + schema: &Schema, + filters: &[Expr], +) -> std::collections::HashMap { + fn strip(expr: &Expr) -> &Expr { + match expr { + Expr::Cast(c) => strip(&c.expr), + Expr::Alias(a) => strip(&a.expr), + _ => expr, + } + } + + fn tighten_min( + cur: &mut Option<(crate::spec::types::values::PrimitiveLiteral, bool)>, + cand: (crate::spec::types::values::PrimitiveLiteral, bool), + ) { + match cur { + None => *cur = Some(cand), + Some((ref mut v, ref mut incl)) => { + if gt_prim(&cand.0, v) || (eq_prim(&cand.0, v) && cand.1 && !*incl) { + *v = cand.0; + *incl = cand.1; + } + } + } + } + fn tighten_max( + cur: &mut Option<(crate::spec::types::values::PrimitiveLiteral, bool)>, + cand: (crate::spec::types::values::PrimitiveLiteral, bool), + ) { + match cur { + None => *cur = Some(cand), + Some((ref mut v, ref mut incl)) => { + if lt_prim(&cand.0, v) || (eq_prim(&cand.0, v) && cand.1 && !*incl) { + *v = cand.0; + *incl = cand.1; + } + } + } + } + + let mut result: std::collections::HashMap = + std::collections::HashMap::new(); + + fn visit_expr( + acc: &mut std::collections::HashMap, + schema: &Schema, + e: &Expr, + ) { + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = e { + let l = strip(left); + let r = strip(right); + match op { + Operator::Gt | Operator::GtEq => { + if let (Expr::Column(c), Expr::Literal(sv, _)) = (l, r) { + if let Some(field) = schema.field_by_name(&c.name) { + if let Some(pl) = scalar_to_primitive_literal(sv) { + let entry = acc.entry(field.id).or_default(); + tighten_min(&mut entry.min, (pl, *op == Operator::GtEq)); + } + } + } + } + Operator::Lt | Operator::LtEq => { + if let (Expr::Column(c), Expr::Literal(sv, _)) = (l, r) { + if let Some(field) = schema.field_by_name(&c.name) { + if let Some(pl) = scalar_to_primitive_literal(sv) { + let entry = acc.entry(field.id).or_default(); + tighten_max(&mut entry.max, (pl, *op == Operator::LtEq)); + } + } + } + } + Operator::And => { + visit_expr(acc, schema, l); + visit_expr(acc, schema, r); + } + _ => {} + } + } + } + + for expr in filters { + visit_expr(&mut result, schema, expr); + } + result +} + +fn scalar_to_primitive_literal( + sv: &datafusion::common::scalar::ScalarValue, +) -> Option { + use crate::spec::types::values::PrimitiveLiteral::*; + match sv { + datafusion::common::scalar::ScalarValue::Boolean(Some(v)) => Some(Boolean(*v)), + datafusion::common::scalar::ScalarValue::Int32(Some(v)) => Some(Int(*v)), + datafusion::common::scalar::ScalarValue::Int64(Some(v)) => Some(Long(*v)), + datafusion::common::scalar::ScalarValue::Float32(Some(v)) => { + Some(Float(ordered_float::OrderedFloat(*v))) + } + datafusion::common::scalar::ScalarValue::Float64(Some(v)) => { + Some(Double(ordered_float::OrderedFloat(*v))) + } + datafusion::common::scalar::ScalarValue::Utf8(Some(s)) => Some(String(s.clone())), + datafusion::common::scalar::ScalarValue::LargeUtf8(Some(s)) => Some(String(s.clone())), + datafusion::common::scalar::ScalarValue::Binary(Some(b)) => Some(Binary(b.clone())), + _ => None, + } +} + +fn decode_bound_bytes( + ty: &PrimitiveType, + bytes: &[u8], +) -> Result { + use crate::spec::types::values::PrimitiveLiteral as PL; + let pl = match ty { + PrimitiveType::Boolean => { + let val = !(bytes.len() == 1 && bytes[0] == 0u8); + PL::Boolean(val) + } + PrimitiveType::Int | 
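// --- Illustrative aside, not part of the patch ----------------------------------
// Iceberg bounds use the spec's single-value binary serialization: fixed-width
// primitives are little-endian, strings are raw UTF-8 bytes. For example, an `int`
// bound of 2023 (0x07E7) is stored as the four bytes [0xE7, 0x07, 0x00, 0x00].
// A minimal standalone decode (hypothetical helper) for just that case:
fn decode_int_bound(bytes: &[u8]) -> Option<i32> {
    // Exactly four little-endian bytes are expected for an Iceberg `int`.
    bytes.try_into().ok().map(i32::from_le_bytes)
}

#[cfg(test)]
mod bound_decoding_sketch {
    use super::decode_int_bound;

    #[test]
    fn decodes_little_endian_int() {
        assert_eq!(decode_int_bound(&[0xE7, 0x07, 0x00, 0x00]), Some(2023));
        assert_eq!(decode_int_bound(&[0x01, 0x02, 0x03]), None); // wrong width
    }
}
// ---------------------------------------------------------------------------------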
PrimitiveType::Date => { + let val = i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?); + PL::Int(val) + } + PrimitiveType::Long + | PrimitiveType::Time + | PrimitiveType::Timestamp + | PrimitiveType::Timestamptz + | PrimitiveType::TimestampNs + | PrimitiveType::TimestamptzNs => { + let val = if bytes.len() == 4 { + i32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i32 bytes")?) as i64 + } else { + i64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid i64 bytes")?) + }; + PL::Long(val) + } + PrimitiveType::Float => { + let val = f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?); + PL::Float(ordered_float::OrderedFloat(val)) + } + PrimitiveType::Double => { + let val = if bytes.len() == 4 { + f32::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f32 bytes")?) as f64 + } else { + f64::from_le_bytes(bytes.try_into().map_err(|_| "Invalid f64 bytes")?) + }; + PL::Double(ordered_float::OrderedFloat(val)) + } + PrimitiveType::String => { + let val = std::str::from_utf8(bytes) + .map_err(|_| "Invalid UTF-8")? + .to_string(); + PL::String(val) + } + PrimitiveType::Uuid => { + return Err("uuid bound decoding not supported".to_string()); + } + PrimitiveType::Fixed(_) | PrimitiveType::Binary => { + // Treat bounds as raw bytes for conservative comparisons (equality filters only) + PL::Binary(bytes.to_vec()) + } + PrimitiveType::Decimal { .. } => { + return Err("decimal bound decoding not supported".to_string()); + } + }; + Ok(pl) +} + +fn lt_prim( + a: &crate::spec::types::values::PrimitiveLiteral, + b: &crate::spec::types::values::PrimitiveLiteral, +) -> bool { + use crate::spec::types::values::PrimitiveLiteral as PL; + match (a, b) { + (PL::Int(x), PL::Int(y)) => x < y, + (PL::Long(x), PL::Long(y)) => x < y, + (PL::Float(x), PL::Float(y)) => x < y, + (PL::Double(x), PL::Double(y)) => x < y, + (PL::String(x), PL::String(y)) => x < y, + _ => false, + } +} + +fn gt_prim( + a: &crate::spec::types::values::PrimitiveLiteral, + b: &crate::spec::types::values::PrimitiveLiteral, +) -> bool { + use crate::spec::types::values::PrimitiveLiteral as PL; + match (a, b) { + (PL::Int(x), PL::Int(y)) => x > y, + (PL::Long(x), PL::Long(y)) => x > y, + (PL::Float(x), PL::Float(y)) => x > y, + (PL::Double(x), PL::Double(y)) => x > y, + (PL::String(x), PL::String(y)) => x > y, + _ => false, + } +} + +fn eq_prim( + a: &crate::spec::types::values::PrimitiveLiteral, + b: &crate::spec::types::values::PrimitiveLiteral, +) -> bool { + use crate::spec::types::values::PrimitiveLiteral as PL; + match (a, b) { + (PL::Int(x), PL::Int(y)) => x == y, + (PL::Long(x), PL::Long(y)) => x == y, + (PL::Float(x), PL::Float(y)) => x == y, + (PL::Double(x), PL::Double(y)) => x == y, + (PL::String(x), PL::String(y)) => x == y, + (PL::Binary(x), PL::Binary(y)) => x == y, + (PL::Boolean(x), PL::Boolean(y)) => x == y, + _ => false, + } +} diff --git a/crates/sail-iceberg/src/table_format.rs b/crates/sail-iceberg/src/table_format.rs index 5a2e801766..c4da01bfe0 100644 --- a/crates/sail-iceberg/src/table_format.rs +++ b/crates/sail-iceberg/src/table_format.rs @@ -8,7 +8,7 @@ use sail_common_datafusion::datasource::{SinkInfo, SourceInfo, TableFormat}; use url::Url; use crate::datasource::provider::IcebergTableProvider; -use crate::spec::{Schema, Snapshot, TableMetadata}; +use crate::spec::{PartitionSpec, Schema, Snapshot, TableMetadata}; #[derive(Debug)] pub struct IcebergTableFormat; @@ -38,13 +38,19 @@ impl TableFormat for IcebergTableFormat { let table_url = Self::parse_table_url(ctx, 
paths).await?; log::info!("[ICEBERG] Parsed table URL: {}", table_url); - let (iceberg_schema, snapshot) = load_table_metadata(ctx, &table_url).await?; + let (iceberg_schema, snapshot, partition_specs) = + load_table_metadata(ctx, &table_url).await?; log::info!( "[ICEBERG] Loaded metadata, snapshot_id: {}", snapshot.snapshot_id() ); - let provider = IcebergTableProvider::new(table_url.to_string(), iceberg_schema, snapshot)?; + let provider = IcebergTableProvider::new( + table_url.to_string(), + iceberg_schema, + snapshot, + partition_specs, + )?; Ok(Arc::new(provider)) } @@ -85,7 +91,10 @@ impl IcebergTableFormat { } /// Load Iceberg table metadata from the table location -async fn load_table_metadata(ctx: &dyn Session, table_url: &Url) -> Result<(Schema, Snapshot)> { +async fn load_table_metadata( + ctx: &dyn Session, + table_url: &Url, +) -> Result<(Schema, Snapshot, Vec)> { log::debug!("[ICEBERG] Loading table metadata from: {}", table_url); let object_store = ctx .runtime_env() @@ -131,7 +140,8 @@ async fn load_table_metadata(ctx: &dyn Session, table_url: &Url) -> Result<(Sche })? .clone(); - Ok((schema, snapshot)) + let partition_specs = table_metadata.partition_specs.clone(); + Ok((schema, snapshot, partition_specs)) } /// Find the latest metadata file in the table location From 22b2e123bdf4acc43f38337f8b2c9d5468a2d7a3 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Mon, 13 Oct 2025 16:38:56 +0800 Subject: [PATCH 18/32] test --- .../tests/spark/iceberg/test_iceberg_io.py | 249 +++++++++--------- .../spark/iceberg/test_iceberg_pruning.py | 168 ++++++++++++ 2 files changed, 291 insertions(+), 126 deletions(-) create mode 100644 python/pysail/tests/spark/iceberg/test_iceberg_pruning.py diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_io.py b/python/pysail/tests/spark/iceberg/test_iceberg_io.py index cee0fe5b68..0df234e9bc 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_io.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_io.py @@ -7,157 +7,154 @@ from pyiceberg.types import DoubleType, LongType, NestedField, StringType -class TestIcebergIO: - @pytest.fixture(scope="class") - def iceberg_test_data(self): - return [ - {"id": 10, "event": "A", "score": 0.98}, - {"id": 11, "event": "B", "score": 0.54}, - {"id": 12, "event": "A", "score": 0.76}, - ] - - @pytest.fixture(scope="class") - def expected_pandas_df(self): - return pd.DataFrame({"id": [10, 11, 12], "event": ["A", "B", "A"], "score": [0.98, 0.54, 0.76]}).astype( - {"id": "int64", "event": "string", "score": "float64"} +@pytest.fixture +def iceberg_test_data(): + return [ + {"id": 10, "event": "A", "score": 0.98}, + {"id": 11, "event": "B", "score": 0.54}, + {"id": 12, "event": "A", "score": 0.76}, + ] + + +@pytest.fixture +def expected_pandas_df(): + return pd.DataFrame({"id": [10, 11, 12], "event": ["A", "B", "A"], "score": [0.98, 0.54, 0.76]}).astype( + {"id": "int64", "event": "string", "score": "float64"} + ) + + +def test_iceberg_io_basic_read(spark, iceberg_test_data, expected_pandas_df, tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + table_name = "test_table" + + catalog = load_catalog( + "test_catalog", + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", + ) + + catalog.create_namespace("default") + + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="event", field_type=StringType(), required=False), + NestedField(field_id=3, name="score", 
field_type=DoubleType(), required=False), + ) + + table = catalog.create_table( + identifier=f"default.{table_name}", + schema=schema, + ) + + try: + df = pd.DataFrame(iceberg_test_data) + arrow_table = pa.Table.from_pandas(df) + table.append(arrow_table) + + table_path = table.location() + + result_df = spark.read.format("iceberg").load(table_path).sort("id") + + assert_frame_equal( + result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False ) + finally: + catalog.drop_table(f"default.{table_name}") - def test_iceberg_io_basic_read(self, spark, iceberg_test_data, expected_pandas_df, tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() - table_name = "test_table" - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) +def test_iceberg_io_read_with_sql(spark, iceberg_test_data, expected_pandas_df, tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + table_name = "test_table_sql" - catalog.create_namespace("default") + catalog = load_catalog( + "test_catalog", + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", + ) - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="event", field_type=StringType(), required=False), - NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), - ) + catalog.create_namespace("default") - table = catalog.create_table( - identifier=f"default.{table_name}", - schema=schema, - ) + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="event", field_type=StringType(), required=False), + NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), + ) - try: - df = pd.DataFrame(iceberg_test_data) - arrow_table = pa.Table.from_pandas(df) - table.append(arrow_table) + table = catalog.create_table( + identifier=f"default.{table_name}", + schema=schema, + ) - table_location = table.metadata_location - # TODO: Keep file:// prefix for Sail, just remove /metadata/... 
suffix - table_path = table_location.rsplit("/metadata/", 1)[0] + try: + df = pd.DataFrame(iceberg_test_data) + arrow_table = pa.Table.from_pandas(df) + table.append(arrow_table) - result_df = spark.read.format("iceberg").load(table_path).sort("id") + table_path = table.location() + + spark.sql(f"CREATE TABLE my_iceberg USING iceberg LOCATION '{table_path}'") + + try: + result_df = spark.sql("SELECT * FROM my_iceberg").sort("id") assert_frame_equal( result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False ) finally: - catalog.drop_table(f"default.{table_name}") - - def test_iceberg_io_read_with_sql(self, spark, iceberg_test_data, expected_pandas_df, tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() - table_name = "test_table_sql" - - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) + spark.sql("DROP TABLE IF EXISTS my_iceberg") + finally: + catalog.drop_table(f"default.{table_name}") - catalog.create_namespace("default") - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="event", field_type=StringType(), required=False), - NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), - ) +def test_iceberg_io_multiple_files(spark, tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + table_name = "test_table_multiple" - table = catalog.create_table( - identifier=f"default.{table_name}", - schema=schema, - ) + catalog = load_catalog( + "test_catalog", + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", + ) - try: - df = pd.DataFrame(iceberg_test_data) - arrow_table = pa.Table.from_pandas(df) - table.append(arrow_table) + catalog.create_namespace("default") - table_location = table.metadata_location - # TODO: Keep file:// prefix for Sail, just remove /metadata/... 
suffix - table_path = table_location.rsplit("/metadata/", 1)[0] + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="value", field_type=StringType(), required=False), + ) - spark.sql(f"CREATE TABLE my_iceberg USING iceberg LOCATION '{table_path}'") + table = catalog.create_table( + identifier=f"default.{table_name}", + schema=schema, + ) - try: - result_df = spark.sql("SELECT * FROM my_iceberg").sort("id") + try: + df1 = pd.DataFrame([{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]) + arrow_table1 = pa.Table.from_pandas(df1) + table.append(arrow_table1) - assert_frame_equal( - result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False - ) - finally: - spark.sql("DROP TABLE IF EXISTS my_iceberg") - finally: - catalog.drop_table(f"default.{table_name}") - - def test_iceberg_io_multiple_files(self, spark, tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() - table_name = "test_table_multiple" - - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) + df2 = pd.DataFrame([{"id": 3, "value": "c"}, {"id": 4, "value": "d"}]) + arrow_table2 = pa.Table.from_pandas(df2) + table.append(arrow_table2) - catalog.create_namespace("default") + table_path = table.location() - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="value", field_type=StringType(), required=False), - ) + result_df = spark.read.format("iceberg").load(table_path).sort("id") - table = catalog.create_table( - identifier=f"default.{table_name}", - schema=schema, + expected_data = pd.DataFrame({"id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]}).astype( + {"id": "int64", "value": "string"} ) - try: - df1 = pd.DataFrame([{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]) - arrow_table1 = pa.Table.from_pandas(df1) - table.append(arrow_table1) - - df2 = pd.DataFrame([{"id": 3, "value": "c"}, {"id": 4, "value": "d"}]) - arrow_table2 = pa.Table.from_pandas(df2) - table.append(arrow_table2) - - table_location = table.metadata_location - # TODO: Keep file:// prefix for Sail, just remove /metadata/... 
suffix - table_path = table_location.rsplit("/metadata/", 1)[0] - - result_df = spark.read.format("iceberg").load(table_path).sort("id") - - expected_data = pd.DataFrame({"id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]}).astype( - {"id": "int64", "value": "string"} - ) - - assert_frame_equal( - result_df.toPandas(), expected_data.sort_values("id").reset_index(drop=True), check_dtype=False - ) + assert_frame_equal( + result_df.toPandas(), expected_data.sort_values("id").reset_index(drop=True), check_dtype=False + ) - assert result_df.count() == 4 # noqa: PLR2004 - finally: - catalog.drop_table(f"default.{table_name}") + assert result_df.count() == 4 # noqa: PLR2004 + finally: + catalog.drop_table(f"default.{table_name}") diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py new file mode 100644 index 0000000000..827ac95ebf --- /dev/null +++ b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py @@ -0,0 +1,168 @@ +# ruff: noqa +import pandas as pd +import pyarrow as pa +from pyiceberg.catalog import load_catalog +from pyiceberg.schema import Schema +from pyiceberg.types import BooleanType, DoubleType, IntegerType, LongType, NestedField, StringType + + +def _create_catalog(tmp_path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir() + catalog = load_catalog( + "test_catalog", + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", + ) + catalog.create_namespace("default") + return catalog + + +def _extract_files_scanned(explain_output): + for line in explain_output.split("\n"): + if "files scanned" in line.lower() or "file" in line.lower(): + parts = line.split() + for i, part in enumerate(parts): + if part.isdigit() and i + 2 < len(parts) and parts[i + 2].lower() in ["files", "file"]: + return int(part) + return None + + +def test_equality_and_in(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_eq_in", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="year", field_type=IntegerType(), required=False), + NestedField(field_id=3, name="month", field_type=IntegerType(), required=False), + NestedField(field_id=4, name="value", field_type=StringType(), required=False), + ), + ) + try: + batches = [ + pd.DataFrame({"id": [1, 2], "year": [2023, 2023], "month": [1, 1], "value": ["a", "b"]}), + pd.DataFrame({"id": [3, 4], "year": [2023, 2023], "month": [2, 2], "value": ["c", "d"]}), + pd.DataFrame({"id": [5, 6], "year": [2024, 2024], "month": [1, 1], "value": ["e", "f"]}), + pd.DataFrame({"id": [7, 8], "year": [2024, 2024], "month": [2, 2], "value": ["g", "h"]}), + ] + for df in batches: + df = df.astype({"id": "int64", "year": "int32", "month": "int32"}) + table.append(pa.Table.from_pandas(df)) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("year = 2023") + assert df.count() == 4 + + df = spark.read.format("iceberg").load(tp).filter("year = 2023 AND month = 1") + assert df.count() == 2 + + df = spark.read.format("iceberg").load(tp).filter("month IN (2)") + assert df.count() == 4 + finally: + catalog.drop_table("default.prune_eq_in") + + +def test_comparison_and_between(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_cmp", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), 
+ NestedField(field_id=2, name="year", field_type=IntegerType(), required=False), + NestedField(field_id=3, name="month", field_type=IntegerType(), required=False), + ), + ) + try: + data = [] + for year in [2021, 2022, 2023, 2024]: + for month in [1, 6, 12]: + data.append({"id": len(data) + 1, "year": year, "month": month}) + for i in range(0, len(data), 6): + batch = pd.DataFrame(data[i : i + 6]).astype({"id": "int64", "year": "int32", "month": "int32"}) + table.append(pa.Table.from_pandas(batch)) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("year > 2022") + assert df.count() == 6 + + df = spark.read.format("iceberg").load(tp).filter("year BETWEEN 2022 AND 2023") + assert df.count() == 6 + + df = spark.read.format("iceberg").load(tp).filter("year >= 2023 AND month >= 6") + assert df.count() == 4 + finally: + catalog.drop_table("default.prune_cmp") + + +def test_null_and_boolean(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_null_bool", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="region", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), + ), + ) + try: + table.append( + pa.Table.from_pandas( + pd.DataFrame([{"id": 1, "region": None, "active": True}, {"id": 2, "region": None, "active": True}]) + ) + ) + table.append( + pa.Table.from_pandas( + pd.DataFrame([{"id": 3, "region": "US", "active": False}, {"id": 4, "region": "EU", "active": False}]) + ) + ) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("region IS NULL") + assert df.count() == 2 + + df = spark.read.format("iceberg").load(tp).filter("active = true") + assert df.count() == 2 + finally: + catalog.drop_table("default.prune_null_bool") + + +def test_correctness_small(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_correct", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="year", field_type=IntegerType(), required=False), + NestedField(field_id=3, name="month", field_type=IntegerType(), required=False), + NestedField(field_id=4, name="val", field_type=DoubleType(), required=False), + ), + ) + try: + records = [] + for year in [2022, 2023]: + for month in [1, 2, 3]: + for i in range(5): + records.append({"id": len(records) + 1, "year": year, "month": month, "val": float(i)}) + for i in range(0, len(records), 10): + batch = pd.DataFrame(records[i : i + 10]).astype( + {"id": "int64", "year": "int32", "month": "int32", "val": "float64"} + ) + table.append(pa.Table.from_pandas(batch)) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("year = 2023") + assert df.count() == 15 + + df = spark.read.format("iceberg").load(tp).filter("year = 2022 AND month = 2") + assert df.count() == 5 + + df = spark.read.format("iceberg").load(tp).filter("(year = 2022 AND month = 1) OR (year = 2023 AND month = 3)") + assert df.count() == 10 + finally: + catalog.drop_table("default.prune_correct") From 55acd5a9890b995a83c50b2a7edd053c08583598 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 14 Oct 2025 09:40:52 +0800 Subject: [PATCH 19/32] log level --- .../sail-iceberg/src/datasource/provider.rs | 63 +++++++------------ .../src/spec/metadata/table_metadata.rs | 24 +++---- 
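// --- Illustrative aside, not part of the patch ----------------------------------
// The diff below drops the "[ICEBERG]" prefixes and demotes scan logging to `trace`,
// so the messages are hidden at the default level. With the `log` facade, a binary
// that uses an env_logger-style filter (hypothetical here; Sail's actual logger
// setup may differ) can re-enable them per module:
fn init_trace_logging_sketch() {
    env_logger::Builder::from_env(
        env_logger::Env::default().default_filter_or("sail_iceberg=trace"),
    )
    .init();
}
// ---------------------------------------------------------------------------------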
crates/sail-iceberg/src/table_format.rs | 26 +++----- 3 files changed, 43 insertions(+), 70 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 4ba2718a94..e08795b063 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -57,15 +57,15 @@ impl IcebergTableProvider { partition_specs: Vec, ) -> DataFusionResult { let table_uri_str = table_uri.to_string(); - log::info!("[ICEBERG] Creating table provider for: {}", table_uri_str); + log::trace!("Creating table provider for: {}", table_uri_str); let arrow_schema = Arc::new(iceberg_schema_to_arrow(&schema).map_err(|e| { - log::error!("[ICEBERG] Failed to convert schema to Arrow: {:?}", e); + log::trace!("Failed to convert schema to Arrow: {:?}", e); e })?); - log::debug!( - "[ICEBERG] Converted schema to Arrow with {} fields", + log::trace!( + "Converted schema to Arrow with {} fields", arrow_schema.fields().len() ); @@ -114,13 +114,10 @@ impl IcebergTableProvider { object_store: &Arc, ) -> DataFusionResult { let manifest_list_str = self.snapshot.manifest_list(); - log::debug!("[ICEBERG] Manifest list path: {}", manifest_list_str); + log::trace!("Manifest list path: {}", manifest_list_str); let manifest_list_path = if let Ok(url) = Url::parse(manifest_list_str) { - log::debug!( - "[ICEBERG] Parsed manifest list as URL, path: {}", - url.path() - ); + log::trace!("Parsed manifest list as URL, path: {}", url.path()); ObjectPath::from(url.path()) } else { ObjectPath::from(manifest_list_str) @@ -130,17 +127,14 @@ impl IcebergTableProvider { .get(&manifest_list_path) .await .map_err(|e| { - log::error!("[ICEBERG] Failed to get manifest list: {:?}", e); + log::trace!("Failed to get manifest list: {:?}", e); datafusion::common::DataFusionError::External(Box::new(e)) })? .bytes() .await .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - log::debug!( - "[ICEBERG] Read {} bytes from manifest list", - manifest_list_data.len() - ); + log::trace!("Read {} bytes from manifest list", manifest_list_data.len()); ManifestList::parse_with_version(&manifest_list_data, FormatVersion::V2) .map_err(datafusion::common::DataFusionError::Execution) @@ -172,10 +166,10 @@ impl IcebergTableProvider { } let manifest_path_str = manifest_file.manifest_path.as_str(); - log::debug!("[ICEBERG] Loading manifest: {}", manifest_path_str); + log::trace!("Loading manifest: {}", manifest_path_str); let manifest_path = if let Ok(url) = Url::parse(manifest_path_str) { - log::debug!("[ICEBERG] Parsed manifest as URL, path: {}", url.path()); + log::trace!("Parsed manifest as URL, path: {}", url.path()); ObjectPath::from(url.path()) } else { ObjectPath::from(manifest_path_str) @@ -185,14 +179,14 @@ impl IcebergTableProvider { .get(&manifest_path) .await .map_err(|e| { - log::error!("[ICEBERG] Failed to get manifest: {:?}", e); + log::trace!("Failed to get manifest: {:?}", e); datafusion::common::DataFusionError::External(Box::new(e)) })? 
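// --- Illustrative aside, not part of the patch ----------------------------------
// Manifest lists and manifests are fetched with the usual object_store pattern: one
// `get` request, then `bytes()` to buffer the whole (small) Avro file before parsing.
// A minimal standalone sketch of that read pattern (hypothetical function name):
use std::sync::Arc;

use object_store::path::Path as ObjectPath;
use object_store::ObjectStore;

async fn read_object(
    store: &Arc<dyn ObjectStore>,
    location: &ObjectPath,
) -> object_store::Result<bytes::Bytes> {
    // `get` issues a single request; `bytes` drains the result into one buffer.
    store.get(location).await?.bytes().await
}
// The provider uses exactly this shape for the manifest list and for each manifest,
// then hands the buffer to the Avro parsers.
// ---------------------------------------------------------------------------------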
.bytes() .await .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - log::debug!("[ICEBERG] Read {} bytes from manifest", manifest_data.len()); + log::trace!("Read {} bytes from manifest", manifest_data.len()); let manifest = Manifest::parse_avro(&manifest_data) .map_err(datafusion::common::DataFusionError::Execution)?; @@ -251,7 +245,7 @@ impl IcebergTableProvider { for data_file in data_files { let file_path_str = data_file.file_path(); - log::debug!("[ICEBERG] Processing data file: {}", file_path_str); + log::trace!("Processing data file: {}", file_path_str); let file_path = if let Ok(url) = Url::parse(file_path_str) { ObjectPath::from(url.path()) @@ -264,7 +258,7 @@ impl IcebergTableProvider { )) }; - log::debug!("[ICEBERG] Final ObjectPath: {}", file_path); + log::trace!("Final ObjectPath: {}", file_path); let object_meta = ObjectMeta { location: file_path, @@ -526,26 +520,23 @@ impl TableProvider for IcebergTableProvider { _filters: &[Expr], limit: Option, ) -> DataFusionResult> { - log::info!("[ICEBERG] Starting scan for table: {}", self.table_uri); + log::trace!("Starting scan for table: {}", self.table_uri); let object_store = self.get_object_store(session)?; - log::debug!("[ICEBERG] Got object store"); + log::trace!("Got object store"); - log::info!( - "[ICEBERG] Loading manifest list from: {}", + log::trace!( + "Loading manifest list from: {}", self.snapshot.manifest_list() ); let manifest_list = self.load_manifest_list(&object_store).await?; - log::info!( - "[ICEBERG] Loaded {} manifest files", - manifest_list.entries().len() - ); + log::trace!("Loaded {} manifest files", manifest_list.entries().len()); - log::info!("[ICEBERG] Loading data files from manifests..."); + log::trace!("Loading data files from manifests..."); let mut data_files = self .load_data_files(session, _filters, &object_store, &manifest_list) .await?; - log::info!("[ICEBERG] Loaded {} data files", data_files.len()); + log::trace!("Loaded {} data files", data_files.len()); // TODO: Manifest-level pruning using partition summaries to avoid loading all files // TODO: Partition-transform aware filtering before file-level metrics pruning @@ -564,18 +555,12 @@ impl TableProvider for IcebergTableProvider { )?; _pruning_mask = mask; data_files = kept; - log::info!( - "[ICEBERG] Pruned data files, remaining: {}", - data_files.len() - ); + log::trace!("Pruned data files, remaining: {}", data_files.len()); } - log::info!("[ICEBERG] Creating partitioned files..."); + log::trace!("Creating partitioned files..."); let partitioned_files = self.create_partitioned_files(data_files.clone())?; - log::info!( - "[ICEBERG] Created {} partitioned files", - partitioned_files.len() - ); + log::trace!("Created {} partitioned files", partitioned_files.len()); // Step 4: Create file groups let file_groups = self.create_file_groups(partitioned_files); diff --git a/crates/sail-iceberg/src/spec/metadata/table_metadata.rs b/crates/sail-iceberg/src/spec/metadata/table_metadata.rs index cb79b3bbaf..f07bbffe23 100644 --- a/crates/sail-iceberg/src/spec/metadata/table_metadata.rs +++ b/crates/sail-iceberg/src/spec/metadata/table_metadata.rs @@ -144,37 +144,31 @@ impl TableMetadata { } pub fn from_json(data: &[u8]) -> Result { - log::debug!("[ICEBERG] Attempting to parse table metadata JSON"); + log::trace!("Attempting to parse table metadata JSON"); match serde_json::from_slice::(data) { Ok(json_value) => { if let Some(obj) = json_value.as_object() { - log::debug!( - "[ICEBERG] JSON fields present: {:?}", - 
obj.keys().collect::>() - ); + log::trace!("JSON fields present: {:?}", obj.keys().collect::>()); if let Some(refs) = obj.get("refs") { - log::debug!("[ICEBERG] refs field: {:?}", refs); + log::trace!("refs field: {:?}", refs); } if let Some(sort_orders) = obj.get("sort-orders") { - log::debug!("[ICEBERG] sort-orders field: {:?}", sort_orders); + log::trace!("sort-orders field: {:?}", sort_orders); } if let Some(stats) = obj.get("statistics") { - log::debug!("[ICEBERG] statistics field: {:?}", stats); + log::trace!("statistics field: {:?}", stats); } if let Some(partition_stats) = obj.get("partition-statistics") { - log::debug!( - "[ICEBERG] partition-statistics field: {:?}", - partition_stats - ); + log::trace!("partition-statistics field: {:?}", partition_stats); } } - log::debug!("[ICEBERG] Deserializing to TableMetadata struct"); + log::trace!("Deserializing to TableMetadata struct"); serde_json::from_value::(json_value) .map_err(|e| { - log::error!("[ICEBERG] Failed to deserialize TableMetadata: {:?}", e); + log::trace!("Failed to deserialize TableMetadata: {:?}", e); e }) .map(|tm| match tm { @@ -182,7 +176,7 @@ impl TableMetadata { }) } Err(e) => { - log::error!("[ICEBERG] Failed to parse as JSON: {:?}", e); + log::trace!("Failed to parse as JSON: {:?}", e); Err(e) } } diff --git a/crates/sail-iceberg/src/table_format.rs b/crates/sail-iceberg/src/table_format.rs index c4da01bfe0..5c892215d1 100644 --- a/crates/sail-iceberg/src/table_format.rs +++ b/crates/sail-iceberg/src/table_format.rs @@ -34,16 +34,13 @@ impl TableFormat for IcebergTableFormat { options: _options, } = info; - log::info!("[ICEBERG] Creating table provider for paths: {:?}", paths); + log::trace!("Creating table provider for paths: {:?}", paths); let table_url = Self::parse_table_url(ctx, paths).await?; - log::info!("[ICEBERG] Parsed table URL: {}", table_url); + log::trace!("Parsed table URL: {}", table_url); let (iceberg_schema, snapshot, partition_specs) = load_table_metadata(ctx, &table_url).await?; - log::info!( - "[ICEBERG] Loaded metadata, snapshot_id: {}", - snapshot.snapshot_id() - ); + log::trace!("Loaded metadata, snapshot_id: {}", snapshot.snapshot_id()); let provider = IcebergTableProvider::new( table_url.to_string(), @@ -95,7 +92,7 @@ async fn load_table_metadata( ctx: &dyn Session, table_url: &Url, ) -> Result<(Schema, Snapshot, Vec)> { - log::debug!("[ICEBERG] Loading table metadata from: {}", table_url); + log::trace!("Loading table metadata from: {}", table_url); let object_store = ctx .runtime_env() .object_store_registry @@ -103,7 +100,7 @@ async fn load_table_metadata( .map_err(|e| DataFusionError::External(Box::new(e)))?; let metadata_location = find_latest_metadata_file(&object_store, table_url).await?; - log::info!("[ICEBERG] Found metadata file: {}", metadata_location); + log::trace!("Found metadata file: {}", metadata_location); let metadata_path = object_store::path::Path::from(metadata_location.as_str()); let metadata_data = object_store @@ -114,13 +111,10 @@ async fn load_table_metadata( .await .map_err(|e| DataFusionError::External(Box::new(e)))?; - log::debug!( - "[ICEBERG] Read {} bytes from metadata file", - metadata_data.len() - ); + log::trace!("Read {} bytes from metadata file", metadata_data.len()); let table_metadata = TableMetadata::from_json(&metadata_data).map_err(|e| { - log::error!("[ICEBERG] Failed to parse table metadata: {:?}", e); + log::trace!("Failed to parse table metadata: {:?}", e); DataFusionError::External(Box::new(e)) })?; @@ -152,7 +146,7 @@ async fn 
find_latest_metadata_file( use futures::TryStreamExt; use object_store::path::Path as ObjectPath; - log::debug!("[ICEBERG] Finding latest metadata file"); + log::trace!("Finding latest metadata file"); let version_hint_path = ObjectPath::from(format!("{}metadata/version-hint.text", table_url.path()).as_str()); @@ -162,13 +156,13 @@ async fn find_latest_metadata_file( let version = version_hint.trim().parse::().unwrap_or(0); let metadata_file = format!("{}/metadata/v{}.metadata.json", table_url.path(), version); - log::debug!("[ICEBERG] Using version hint: {}", version); + log::trace!("Using version hint: {}", version); return Ok(metadata_file); } } } - log::debug!("[ICEBERG] No version hint, listing metadata directory"); + log::trace!("No version hint, listing metadata directory"); let metadata_prefix = ObjectPath::from(format!("{}metadata/", table_url.path()).as_str()); let objects = object_store.list(Some(&metadata_prefix)); From 498c233669f887b53cfb7a095b1ead03e37833ed Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 14 Oct 2025 10:36:14 +0800 Subject: [PATCH 20/32] rewrite --- .../src/datasource/expr_adapter.rs | 283 ++++++++++++++++++ .../src/datasource/expressions.rs | 17 +- crates/sail-iceberg/src/datasource/mod.rs | 1 + .../sail-iceberg/src/datasource/provider.rs | 3 + crates/sail-iceberg/src/datasource/pruning.rs | 25 +- .../spark/iceberg/test_iceberg_pruning.py | 114 +++++++ 6 files changed, 438 insertions(+), 5 deletions(-) create mode 100644 crates/sail-iceberg/src/datasource/expr_adapter.rs diff --git a/crates/sail-iceberg/src/datasource/expr_adapter.rs b/crates/sail-iceberg/src/datasource/expr_adapter.rs new file mode 100644 index 0000000000..0423c749c3 --- /dev/null +++ b/crates/sail-iceberg/src/datasource/expr_adapter.rs @@ -0,0 +1,283 @@ +use std::sync::Arc; + +use datafusion::arrow::compute::can_cast_types; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema as ArrowSchema, SchemaRef}; +use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode}; +use datafusion::common::{exec_err, Result, ScalarValue}; +use datafusion::physical_expr::expressions::{Column, Literal}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_expr_adapter::{PhysicalExprAdapter, PhysicalExprAdapterFactory}; + +#[derive(Debug)] +pub struct IcebergPhysicalExprAdapterFactory {} + +impl PhysicalExprAdapterFactory for IcebergPhysicalExprAdapterFactory { + fn create( + &self, + logical_file_schema: SchemaRef, + physical_file_schema: SchemaRef, + ) -> Arc { + let (column_mapping, default_values) = + create_column_mapping(&logical_file_schema, &physical_file_schema); + + Arc::new(IcebergPhysicalExprAdapter { + logical_file_schema, + physical_file_schema, + partition_values: Vec::new(), + column_mapping, + default_values, + }) + } +} + +fn create_column_mapping( + logical_schema: &ArrowSchema, + physical_schema: &ArrowSchema, +) -> (Vec>, Vec>) { + let mut column_mapping = Vec::with_capacity(logical_schema.fields().len()); + let mut default_values = Vec::with_capacity(logical_schema.fields().len()); + + for logical_field in logical_schema.fields() { + match physical_schema.index_of(logical_field.name()) { + Ok(physical_index) => { + column_mapping.push(Some(physical_index)); + default_values.push(None); + } + Err(_) => { + column_mapping.push(None); + let default_value = if logical_field.is_nullable() { + Some( + ScalarValue::try_from(logical_field.data_type()) + .unwrap_or(ScalarValue::Null), + ) + } else { + 
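// --- Illustrative aside, not part of the patch ----------------------------------
// This mapping backs schema evolution: a column present in the table (logical)
// schema but missing from an older data file is replaced by a constant when
// predicates are rewritten against that file. For nullable columns the constant is
// a typed NULL, matching the semantics of a column added after the file was written.
// Minimal sketch of building such a placeholder (hypothetical helper name):
use datafusion::arrow::datatypes::DataType;
use datafusion::common::{Result, ScalarValue};

fn missing_column_placeholder(logical_type: &DataType) -> Result<ScalarValue> {
    // A NULL literal cast to the column's logical type keeps downstream
    // expression evaluation type-correct.
    ScalarValue::Null.cast_to(logical_type)
}
// ---------------------------------------------------------------------------------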
Some(create_default_value(logical_field.data_type())) + }; + default_values.push(default_value); + } + } + } + + (column_mapping, default_values) +} + +fn create_default_value(data_type: &DataType) -> ScalarValue { + match data_type { + DataType::Boolean => ScalarValue::Boolean(Some(false)), + DataType::Int8 => ScalarValue::Int8(Some(0)), + DataType::Int16 => ScalarValue::Int16(Some(0)), + DataType::Int32 => ScalarValue::Int32(Some(0)), + DataType::Int64 => ScalarValue::Int64(Some(0)), + DataType::UInt8 => ScalarValue::UInt8(Some(0)), + DataType::UInt16 => ScalarValue::UInt16(Some(0)), + DataType::UInt32 => ScalarValue::UInt32(Some(0)), + DataType::UInt64 => ScalarValue::UInt64(Some(0)), + DataType::Float16 => ScalarValue::Float32(Some(0.0)), + DataType::Float32 => ScalarValue::Float32(Some(0.0)), + DataType::Float64 => ScalarValue::Float64(Some(0.0)), + DataType::Utf8 => ScalarValue::Utf8(Some(String::new())), + DataType::LargeUtf8 => ScalarValue::LargeUtf8(Some(String::new())), + DataType::Binary => ScalarValue::Binary(Some(Vec::new())), + DataType::LargeBinary => ScalarValue::LargeBinary(Some(Vec::new())), + DataType::Date32 => ScalarValue::Date32(Some(0)), + DataType::Date64 => ScalarValue::Date64(Some(0)), + DataType::Time32(_) => ScalarValue::Time32Second(Some(0)), + DataType::Time64(_) => ScalarValue::Time64Nanosecond(Some(0)), + DataType::Timestamp(unit, tz) => match unit { + datafusion::arrow::datatypes::TimeUnit::Second => { + ScalarValue::TimestampSecond(Some(0), tz.clone()) + } + datafusion::arrow::datatypes::TimeUnit::Millisecond => { + ScalarValue::TimestampMillisecond(Some(0), tz.clone()) + } + datafusion::arrow::datatypes::TimeUnit::Microsecond => { + ScalarValue::TimestampMicrosecond(Some(0), tz.clone()) + } + datafusion::arrow::datatypes::TimeUnit::Nanosecond => { + ScalarValue::TimestampNanosecond(Some(0), tz.clone()) + } + }, + _ => ScalarValue::Null, + } +} + +#[derive(Debug)] +struct IcebergPhysicalExprAdapter { + logical_file_schema: SchemaRef, + physical_file_schema: SchemaRef, + partition_values: Vec<(FieldRef, ScalarValue)>, + column_mapping: Vec>, + default_values: Vec>, +} + +impl PhysicalExprAdapter for IcebergPhysicalExprAdapter { + fn rewrite(&self, expr: Arc) -> Result> { + let rewriter = IcebergPhysicalExprRewriter { + logical_file_schema: &self.logical_file_schema, + physical_file_schema: &self.physical_file_schema, + partition_values: &self.partition_values, + column_mapping: &self.column_mapping, + default_values: &self.default_values, + }; + expr.transform(|expr| rewriter.rewrite_expr(Arc::clone(&expr))) + .data() + } + + fn with_partition_values( + &self, + partition_values: Vec<(FieldRef, ScalarValue)>, + ) -> Arc { + Arc::new(IcebergPhysicalExprAdapter { + logical_file_schema: Arc::clone(&self.logical_file_schema), + physical_file_schema: Arc::clone(&self.physical_file_schema), + partition_values, + column_mapping: self.column_mapping.clone(), + default_values: self.default_values.clone(), + }) + } +} + +impl Clone for IcebergPhysicalExprAdapter { + fn clone(&self) -> Self { + Self { + logical_file_schema: Arc::clone(&self.logical_file_schema), + physical_file_schema: Arc::clone(&self.physical_file_schema), + partition_values: self.partition_values.clone(), + column_mapping: self.column_mapping.clone(), + default_values: self.default_values.clone(), + } + } +} + +struct IcebergPhysicalExprRewriter<'a> { + logical_file_schema: &'a ArrowSchema, + physical_file_schema: &'a ArrowSchema, + partition_values: &'a [(FieldRef, ScalarValue)], + column_mapping: 
&'a [Option], + default_values: &'a [Option], +} + +impl<'a> IcebergPhysicalExprRewriter<'a> { + fn rewrite_expr( + &self, + expr: Arc, + ) -> Result>> { + if let Some(column) = expr.as_any().downcast_ref::() { + return self.rewrite_column(Arc::clone(&expr), column); + } + Ok(Transformed::no(expr)) + } + + fn rewrite_column( + &self, + expr: Arc, + column: &Column, + ) -> Result>> { + if let Some(partition_value) = self.get_partition_value(column.name()) { + return Ok(Transformed::yes(Arc::new(Literal::new(partition_value)))); + } + + let logical_field_index = match self.logical_file_schema.index_of(column.name()) { + Ok(index) => index, + Err(_) => { + if let Ok(_physical_field) = + self.physical_file_schema.field_with_name(column.name()) + { + return Ok(Transformed::no(expr)); + } else { + return exec_err!( + "Column '{}' not found in either logical or physical schema", + column.name() + ); + } + } + }; + + let logical_field = self.logical_file_schema.field(logical_field_index); + + match self.column_mapping.get(logical_field_index) { + Some(Some(physical_index)) => { + let physical_field = self.physical_file_schema.field(*physical_index); + self.handle_existing_column( + expr, + column, + logical_field, + physical_field, + *physical_index, + ) + } + Some(None) => { + if let Some(Some(default_value)) = self.default_values.get(logical_field_index) { + Ok(Transformed::yes(Arc::new(Literal::new( + default_value.clone(), + )))) + } else if logical_field.is_nullable() { + let null_value = ScalarValue::Null.cast_to(logical_field.data_type())?; + Ok(Transformed::yes(Arc::new(Literal::new(null_value)))) + } else { + exec_err!("Non-nullable column '{}' is missing from physical schema and no default value provided", column.name()) + } + } + None => exec_err!( + "Column mapping not found for logical field index {}", + logical_field_index + ), + } + } + + fn handle_existing_column( + &self, + expr: Arc, + column: &Column, + logical_field: &Field, + physical_field: &Field, + physical_index: usize, + ) -> Result>> { + let needs_index_update = column.index() != physical_index; + let needs_type_cast = logical_field.data_type() != physical_field.data_type(); + + match (needs_index_update, needs_type_cast) { + (false, false) => Ok(Transformed::no(expr)), + (true, false) => { + let new_column = + Column::new_with_schema(logical_field.name(), self.physical_file_schema)?; + Ok(Transformed::yes(Arc::new(new_column))) + } + (false, true) => self.apply_type_cast(expr, logical_field, physical_field), + (true, true) => { + let new_column = + Column::new_with_schema(logical_field.name(), self.physical_file_schema)?; + self.apply_type_cast(Arc::new(new_column), logical_field, physical_field) + } + } + } + + fn apply_type_cast( + &self, + column_expr: Arc, + logical_field: &Field, + physical_field: &Field, + ) -> Result>> { + if !can_cast_types(physical_field.data_type(), logical_field.data_type()) { + return exec_err!( + "Cannot cast column '{}' from '{}' (physical) to '{}' (logical)", + logical_field.name(), + physical_field.data_type(), + logical_field.data_type() + ); + } + let cast_expr = datafusion::physical_expr::expressions::CastExpr::new( + column_expr, + logical_field.data_type().clone(), + None, + ); + Ok(Transformed::yes(Arc::new(cast_expr))) + } + + fn get_partition_value(&self, column_name: &str) -> Option { + self.partition_values + .iter() + .find(|(field, _)| field.name() == column_name) + .map(|(_, value)| value.clone()) + } +} diff --git a/crates/sail-iceberg/src/datasource/expressions.rs 
b/crates/sail-iceberg/src/datasource/expressions.rs index 3e79d835e2..e342b86005 100644 --- a/crates/sail-iceberg/src/datasource/expressions.rs +++ b/crates/sail-iceberg/src/datasource/expressions.rs @@ -30,10 +30,21 @@ pub fn get_pushdown_filters( filter: &[&Expr], _partition_cols: &[String], ) -> Vec { - // Conservatively mark filters as Inexact for now; refine with partition-aware analysis later. - // TODO: Partition-aware filter .iter() - .map(|_| TableProviderFilterPushDown::Inexact) + .map(|expr| match expr { + Expr::BinaryExpr(be) => match be.op { + datafusion::logical_expr::Operator::Eq + | datafusion::logical_expr::Operator::Lt + | datafusion::logical_expr::Operator::LtEq + | datafusion::logical_expr::Operator::Gt + | datafusion::logical_expr::Operator::GtEq + | datafusion::logical_expr::Operator::And + | datafusion::logical_expr::Operator::Or => TableProviderFilterPushDown::Inexact, + _ => TableProviderFilterPushDown::Unsupported, + }, + Expr::InList(_) => TableProviderFilterPushDown::Inexact, + _ => TableProviderFilterPushDown::Unsupported, + }) .collect() } diff --git a/crates/sail-iceberg/src/datasource/mod.rs b/crates/sail-iceberg/src/datasource/mod.rs index 629fbff659..ce0fd04194 100644 --- a/crates/sail-iceberg/src/datasource/mod.rs +++ b/crates/sail-iceberg/src/datasource/mod.rs @@ -1,3 +1,4 @@ +pub mod expr_adapter; pub mod expressions; pub mod provider; pub mod pruning; diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index e08795b063..616d4d931f 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -13,6 +13,7 @@ use datafusion::common::{Result as DataFusionResult, ToDFSchema}; use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder, ParquetSource}; +use datafusion::physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion::datasource::{TableProvider, TableType}; use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::logical_expr::utils::conjunction; @@ -25,6 +26,7 @@ use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; use crate::datasource::expressions::simplify_expr; +use crate::datasource::expr_adapter::IcebergPhysicalExprAdapterFactory; use crate::datasource::pruning::{prune_files, prune_manifests_by_partition_summaries}; use crate::spec::types::values::{Literal, PrimitiveLiteral}; use crate::spec::{ @@ -610,6 +612,7 @@ impl TableProvider for IcebergTableProvider { .with_statistics(table_stats) .with_projection(projection.cloned()) .with_limit(limit) + .with_expr_adapter(Some(Arc::new(IcebergPhysicalExprAdapterFactory {}) as Arc)) .build(); Ok(DataSourceExec::from_data_source(file_scan_config)) diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index eed016c758..20b10e9c32 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -157,8 +157,29 @@ impl PruningStatistics for IcebergPruningStats { _column: &Column, _value: &std::collections::HashSet, ) -> Option { - // TODO: Partition-aware contained pruning - None + // Basic contained() for equality/IN pruning using lower/upper bounds equality for strings and integers + // When both bounds are equal to the value, we can mark "contained" true; otherwise, unknown + let field_id = self.field_id_for(_column)?; + let mut result = 
Vec::with_capacity(self.files.len()); + for f in &self.files { + let lower = f.lower_bounds().get(&field_id); + let upper = f.upper_bounds().get(&field_id); + if let (Some(lb), Some(ub)) = (lower, upper) { + let lb_sv = self.datum_to_scalar(lb); + let ub_sv = self.datum_to_scalar(ub); + let mut any_match = false; + for v in _value.iter() { + if &lb_sv == v && &ub_sv == v { + any_match = true; + break; + } + } + result.push(any_match); + } else { + result.push(false); + } + } + Some(BooleanArray::from(result)) } } diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py index 827ac95ebf..6c8b56b76f 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py @@ -166,3 +166,117 @@ def test_correctness_small(spark, tmp_path): assert df.count() == 10 finally: catalog.drop_table("default.prune_correct") + + +def test_or_and_not_pruning(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_or_and_not", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="year", field_type=IntegerType(), required=False), + NestedField(field_id=3, name="region", field_type=StringType(), required=False), + ), + ) + try: + records = [] + for year in [2022, 2023, 2024]: + for region in ["US", "EU", "ASIA"]: + for i in range(5): + records.append({"id": len(records) + 1, "year": year, "region": region}) + for i in range(0, len(records), 15): + batch = pd.DataFrame(records[i : i + 15]).astype({"id": "int64", "year": "int32"}) + table.append(pa.Table.from_pandas(batch)) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("year = 2022 OR year = 2024") + assert df.count() == 30 + + df = spark.read.format("iceberg").load(tp).filter("year = 2023 AND region != 'ASIA'") + assert df.count() == 10 + + df = spark.read.format("iceberg").load(tp).filter("NOT (year = 2023 AND region = 'US')") + assert df.count() == 40 + finally: + catalog.drop_table("default.prune_or_and_not") + + +def test_string_in_and_range_pruning(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_string_in_range", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="dept", field_type=StringType(), required=False), + NestedField(field_id=3, name="team", field_type=StringType(), required=False), + ), + ) + try: + rows = [ + {"id": 1, "dept": "engineering", "team": "backend"}, + {"id": 2, "dept": "engineering", "team": "frontend"}, + {"id": 3, "dept": "marketing", "team": "growth"}, + {"id": 4, "dept": "sales", "team": "enterprise"}, + ] + table.append(pa.Table.from_pandas(pd.DataFrame(rows))) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("team IN ('backend','frontend')") + assert df.count() == 2 + + df = spark.read.format("iceberg").load(tp).filter("dept > 'engineering'") + assert df.count() == 2 + finally: + catalog.drop_table("default.prune_string_in_range") + + +def test_metrics_based_pruning_numeric(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_metrics_numeric", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="val", field_type=DoubleType(), 
required=False), + ), + ) + try: + data = [] + for chunk in range(4): + for i in range(10): + data.append({"id": chunk * 10 + i, "val": float(i)}) + for i in range(0, len(data), 10): + table.append(pa.Table.from_pandas(pd.DataFrame(data[i : i + 10]))) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("val >= 8.0") + assert df.count() == 8 + + df = spark.read.format("iceberg").load(tp).filter("val BETWEEN 3.0 AND 4.0") + assert df.count() == 8 + finally: + catalog.drop_table("default.prune_metrics_numeric") + + +def test_limit_pushdown_behavior(spark, tmp_path): + catalog = _create_catalog(tmp_path) + table = catalog.create_table( + identifier="default.prune_limit", + schema=Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="flag", field_type=BooleanType(), required=False), + ), + ) + try: + rows = [{"id": i, "flag": i % 2 == 0} for i in range(100)] + for i in range(0, len(rows), 20): + table.append(pa.Table.from_pandas(pd.DataFrame(rows[i : i + 20]).astype({"id": "int64"}))) + + tp = table.location() + + df = spark.read.format("iceberg").load(tp).filter("flag = true").limit(7) + assert df.count() == 7 + finally: + catalog.drop_table("default.prune_limit") From d4e0f5b732d6c89600ed168babb4736452dfa7ce Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 14 Oct 2025 10:42:57 +0800 Subject: [PATCH 21/32] pushdown --- .../sail-iceberg/src/datasource/provider.rs | 175 ++++++++++++++++-- 1 file changed, 163 insertions(+), 12 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 616d4d931f..6bb0690bd0 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -13,20 +13,22 @@ use datafusion::common::{Result as DataFusionResult, ToDFSchema}; use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder, ParquetSource}; -use datafusion::physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion::datasource::{TableProvider, TableType}; use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::logical_expr::utils::conjunction; -use datafusion::logical_expr::{Expr, LogicalPlan}; +use datafusion::logical_expr::{ + BinaryExpr, Expr, LogicalPlan, Operator, TableProviderFilterPushDown, +}; use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion::physical_plan::ExecutionPlan; use object_store::path::Path as ObjectPath; use object_store::ObjectMeta; use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; -use crate::datasource::expressions::simplify_expr; use crate::datasource::expr_adapter::IcebergPhysicalExprAdapterFactory; +use crate::datasource::expressions::simplify_expr; use crate::datasource::pruning::{prune_files, prune_manifests_by_partition_summaries}; use crate::spec::types::values::{Literal, PrimitiveLiteral}; use crate::spec::{ @@ -519,7 +521,7 @@ impl TableProvider for IcebergTableProvider { &self, session: &dyn Session, projection: Option<&Vec>, - _filters: &[Expr], + filters: &[Expr], limit: Option, ) -> DataFusionResult> { log::trace!("Starting scan for table: {}", self.table_uri); @@ -534,9 +536,12 @@ impl TableProvider for IcebergTableProvider { let manifest_list = self.load_manifest_list(&object_store).await?; log::trace!("Loaded {} manifest files", 
manifest_list.entries().len()); + // Classify & split filters for pruning vs parquet pushdown + let (pruning_filters, parquet_pushdown_filters) = self.separate_filters(filters); + log::trace!("Loading data files from manifests..."); let mut data_files = self - .load_data_files(session, _filters, &object_store, &manifest_list) + .load_data_files(session, &pruning_filters, &object_store, &manifest_list) .await?; log::trace!("Loaded {} data files", data_files.len()); @@ -544,14 +549,14 @@ impl TableProvider for IcebergTableProvider { // TODO: Partition-transform aware filtering before file-level metrics pruning // Build filter conjunction and run DataFusion-based pruning on Iceberg metrics - let filter_expr = conjunction(_filters.iter().cloned()); + let filter_expr = conjunction(pruning_filters.iter().cloned()); let mut _pruning_mask: Option> = None; if filter_expr.is_some() || limit.is_some() { let (kept, mask) = prune_files( session, - _filters, + &pruning_filters, limit, - self.arrow_schema.clone(), + self.rebuild_logical_schema_for_filters(projection, filters), data_files, &self.schema, )?; @@ -585,9 +590,11 @@ impl TableProvider for IcebergTableProvider { let mut parquet_source = ParquetSource::new(parquet_options); // Prepare pushdown filter for Parquet - let pushdown_filter: Option> = if !_filters.is_empty() { - let df_schema = self.arrow_schema.clone().to_dfschema()?; - let pushdown_expr = conjunction(_filters.iter().cloned()); + let pushdown_filter: Option> = if !parquet_pushdown_filters.is_empty() + { + let logical_schema = self.rebuild_logical_schema_for_filters(projection, filters); + let df_schema = logical_schema.to_dfschema()?; + let pushdown_expr = conjunction(parquet_pushdown_filters); pushdown_expr.map(|expr| simplify_expr(session, &df_schema, expr)) } else { None @@ -612,9 +619,153 @@ impl TableProvider for IcebergTableProvider { .with_statistics(table_stats) .with_projection(projection.cloned()) .with_limit(limit) - .with_expr_adapter(Some(Arc::new(IcebergPhysicalExprAdapterFactory {}) as Arc)) + .with_expr_adapter(Some(Arc::new(IcebergPhysicalExprAdapterFactory {}) + as Arc)) .build(); Ok(DataSourceExec::from_data_source(file_scan_config)) } + + fn supports_filters_pushdown( + &self, + filter: &[&Expr], + ) -> DataFusionResult> { + Ok(filter + .iter() + .map(|e| self.classify_pushdown_for_expr(e)) + .collect()) + } +} + +impl IcebergTableProvider { + fn classify_pushdown_for_expr(&self, expr: &Expr) -> TableProviderFilterPushDown { + use TableProviderFilterPushDown as FP; + // Identity partition columns get Exact (Eq/IN) or Inexact (ranges) + match expr { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + let (l, r) = (Self::strip_expr(left), Self::strip_expr(right)); + match op { + Operator::Eq => { + if let (Some(col), true) = + (self.expr_as_column_name(l), self.expr_is_literal(r)) + { + if self.is_identity_partition_col(&col) { + return FP::Exact; + } + } + if let (Some(col), true) = + (self.expr_as_column_name(r), self.expr_is_literal(l)) + { + if self.is_identity_partition_col(&col) { + return FP::Exact; + } + } + FP::Unsupported + } + Operator::Gt | Operator::GtEq | Operator::Lt | Operator::LtEq => { + if let Some(col) = self.expr_as_column_name(l) { + if self.expr_is_literal(r) && self.is_identity_partition_col(&col) { + return FP::Inexact; + } + } + FP::Unsupported + } + _ => FP::Unsupported, + } + } + Expr::InList(in_list) if !in_list.negated => { + let e = Self::strip_expr(&in_list.expr); + if let Some(col) = self.expr_as_column_name(e) { + let 
all_literals = in_list.list.iter().all(|it| self.expr_is_literal(it)); + if all_literals && self.is_identity_partition_col(&col) { + TableProviderFilterPushDown::Exact + } else { + TableProviderFilterPushDown::Unsupported + } + } else { + TableProviderFilterPushDown::Unsupported + } + } + _ => TableProviderFilterPushDown::Unsupported, + } + } + + fn separate_filters(&self, filters: &[Expr]) -> (Vec, Vec) { + let mut pruning_filters = Vec::new(); + let mut parquet_pushdown_filters = Vec::new(); + for f in filters.iter() { + match self.classify_pushdown_for_expr(f) { + TableProviderFilterPushDown::Exact => { + pruning_filters.push(f.clone()); + } + TableProviderFilterPushDown::Inexact => { + pruning_filters.push(f.clone()); + parquet_pushdown_filters.push(f.clone()); + } + TableProviderFilterPushDown::Unsupported => {} + } + } + (pruning_filters, parquet_pushdown_filters) + } + + fn strip_expr(expr: &Expr) -> &Expr { + match expr { + Expr::Cast(c) => Self::strip_expr(&c.expr), + Expr::Alias(a) => Self::strip_expr(&a.expr), + _ => expr, + } + } + + fn expr_as_column_name(&self, expr: &Expr) -> Option { + if let Expr::Column(c) = expr { + return Some(c.name.clone()); + } + None + } + + fn expr_is_literal(&self, expr: &Expr) -> bool { + matches!(expr, Expr::Literal(_, _)) + } + + fn is_identity_partition_col(&self, col_name: &str) -> bool { + // Map identity partition source_id to schema field names + let mut names = std::collections::HashSet::new(); + for spec in &self.partition_specs { + for pf in spec.fields().iter() { + if matches!(pf.transform, crate::spec::transform::Transform::Identity) { + if let Some(field) = self.schema.field_by_id(pf.source_id) { + names.insert(field.name.clone()); + } + } + } + } + names.contains(col_name) + } + + fn rebuild_logical_schema_for_filters( + &self, + projection: Option<&Vec>, + filters: &[Expr], + ) -> Arc { + if let Some(used) = projection { + let mut fields: Vec = Vec::new(); + for idx in used { + fields.push(Arc::new(self.arrow_schema.field(*idx).clone())); + } + if let Some(expr) = conjunction(filters.iter().cloned()) { + for c in expr.column_refs() { + if let Ok(idx) = self.arrow_schema.index_of(c.name.as_str()) { + if !used.contains(&idx) + && !fields.iter().any(|f| f.name() == c.name.as_str()) + { + fields.push(Arc::new(self.arrow_schema.field(idx).clone())); + } + } + } + } + Arc::new(ArrowSchema::new(fields)) + } else { + self.arrow_schema.clone() + } + } } From 6c3dfccbc4c3960e79ea98190f2b968f40cc86d1 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 14 Oct 2025 11:16:00 +0800 Subject: [PATCH 22/32] most todos --- .../sail-iceberg/src/datasource/provider.rs | 4 - crates/sail-iceberg/src/datasource/pruning.rs | 24 +- crates/sail-iceberg/src/spec/transform.rs | 71 ++- crates/sail-iceberg/src/spec/types/values.rs | 405 +++++++++++++++++- 4 files changed, 469 insertions(+), 35 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 6bb0690bd0..b240d23905 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -545,9 +545,6 @@ impl TableProvider for IcebergTableProvider { .await?; log::trace!("Loaded {} data files", data_files.len()); - // TODO: Manifest-level pruning using partition summaries to avoid loading all files - // TODO: Partition-transform aware filtering before file-level metrics pruning - // Build filter conjunction and run DataFusion-based pruning on Iceberg metrics let filter_expr = 
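// At this point `pruning_filters` holds the predicates classified above as Exact or Inexact
// (identity-partition equality/IN and range comparisons), while anything classified as
// Unsupported was dropped here and is evaluated by DataFusion above the scan.
// Illustrative (hypothetical) classification for a table partitioned by identity(year):
//   year = 2023     -> Exact   (used for pruning only)
//   year > 2022     -> Inexact (used for pruning and Parquet pushdown)
//   name LIKE 'a%'  -> Unsupported (applied after the scan)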
conjunction(pruning_filters.iter().cloned()); let mut _pruning_mask: Option> = None; @@ -606,7 +603,6 @@ impl TableProvider for IcebergTableProvider { let parquet_source = Arc::new(parquet_source); // Build table statistics from pruned files - // TODO: Include partition-level stats and handle unknowns conservatively let table_stats = self.aggregate_statistics(&data_files); let file_scan_config = diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index 20b10e9c32..f9f6a373d4 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -289,8 +289,16 @@ pub fn prune_manifests_by_partition_summaries<'a>( if lt_prim(lit, lb) || gt_prim(lit, ub) { return false; } - } else { - // TODO: If only one bound is present, use it for pruning when safe + } else if let Some(lb) = lower.as_ref() { + // if we have only a lower bound, drop manifest if lit < lb + if lt_prim(lit, lb) { + return false; + } + } else if let Some(ub) = upper.as_ref() { + // if we have only an upper bound, drop manifest if lit > ub + if gt_prim(lit, ub) { + return false; + } } } } @@ -325,6 +333,18 @@ pub fn prune_manifests_by_partition_summaries<'a>( if !any_in { return false; } + } else if let Some(lb) = lower.as_ref() { + // with only lower bound, require any value >= lb + let any_in = lits.iter().any(|v| !lt_prim(v, lb)); + if !any_in { + return false; + } + } else if let Some(ub) = upper.as_ref() { + // with only upper bound, require any value <= ub + let any_in = lits.iter().any(|v| !gt_prim(v, ub)); + if !any_in { + return false; + } } } } diff --git a/crates/sail-iceberg/src/spec/transform.rs b/crates/sail-iceberg/src/spec/transform.rs index b6ffad9a51..49d4a1b165 100644 --- a/crates/sail-iceberg/src/spec/transform.rs +++ b/crates/sail-iceberg/src/spec/transform.rs @@ -50,16 +50,69 @@ pub enum Transform { } impl Transform { - // TODO: Full value transformation support - pub fn to_human_string(self, _field_type: &Type, value: Option<&Literal>) -> String { - if let Some(_value) = value { - match self { - Self::Identity => "identity_value".to_string(), - Self::Void => "null".to_string(), - _ => "transformed_value".to_string(), + pub fn to_human_string(self, field_type: &Type, value: Option<&Literal>) -> String { + fn bytes_to_hex(bytes: &[u8]) -> String { + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + use std::fmt::Write as _; + let _ = write!(&mut s, "{:02x}", b); } - } else { - "null".to_string() + s + } + + fn lit_str(l: &Literal) -> String { + match l { + Literal::Primitive(p) => match p { + super::types::values::PrimitiveLiteral::Boolean(v) => v.to_string(), + super::types::values::PrimitiveLiteral::Int(v) => v.to_string(), + super::types::values::PrimitiveLiteral::Long(v) => v.to_string(), + super::types::values::PrimitiveLiteral::Float(v) => v.0.to_string(), + super::types::values::PrimitiveLiteral::Double(v) => v.0.to_string(), + super::types::values::PrimitiveLiteral::Int128(v) => v.to_string(), + super::types::values::PrimitiveLiteral::String(v) => v.clone(), + super::types::values::PrimitiveLiteral::UInt128(v) => v.to_string(), + super::types::values::PrimitiveLiteral::Binary(b) => { + format!("0x{}", bytes_to_hex(b)) + } + }, + Literal::Struct(_) | Literal::List(_) | Literal::Map(_) => format!("{:?}", l), + } + } + + match value { + None => "null".to_string(), + Some(val) => match self { + Transform::Identity => lit_str(val), + Transform::Void => "null".to_string(), + 
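// The Truncate arm below applies Iceberg's truncate transform directly: strings keep the first
// `w` characters and integers are rounded down to a multiple of the width, e.g. (hypothetical
// inputs) truncate[3]("engineering") -> "eng" and truncate[10](1234) -> 1230. Bucket renders a
// `bucket[n](value)` placeholder rather than computing the hash, and Year/Month/Day/Hour
// currently return the untransformed source value.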
Transform::Truncate(w) => match (field_type, val) { + ( + Type::Primitive(PrimitiveType::String), + Literal::Primitive(super::types::values::PrimitiveLiteral::String(s)), + ) => s.chars().take(w as usize).collect::(), + ( + Type::Primitive(PrimitiveType::Int), + Literal::Primitive(super::types::values::PrimitiveLiteral::Int(v)), + ) => { + let w = w as i32; + let rem = v.rem_euclid(w); + (v - rem).to_string() + } + ( + Type::Primitive(PrimitiveType::Long), + Literal::Primitive(super::types::values::PrimitiveLiteral::Long(v)), + ) => { + let w = w as i64; + let rem = v.rem_euclid(w); + (v - rem).to_string() + } + _ => lit_str(val), + }, + Transform::Bucket(n) => format!("bucket[{n}]({})", lit_str(val)), + Transform::Year | Transform::Month | Transform::Day | Transform::Hour => { + lit_str(val) + } + Transform::Unknown => lit_str(val), + }, } } diff --git a/crates/sail-iceberg/src/spec/types/values.rs b/crates/sail-iceberg/src/spec/types/values.rs index 3386abf1f1..b0249e426f 100644 --- a/crates/sail-iceberg/src/spec/types/values.rs +++ b/crates/sail-iceberg/src/spec/types/values.rs @@ -63,33 +63,398 @@ impl Datum { } impl Literal { - // TODO: Type-aware JSON conversion pub fn try_from_json( value: JsonValue, - _data_type: &crate::spec::types::Type, + data_type: &crate::spec::types::Type, ) -> Result, String> { - match value { - JsonValue::Null => Ok(None), - _ => Ok(Some(Literal::Primitive(PrimitiveLiteral::String( - value.to_string(), - )))), + use crate::spec::types::PrimitiveType; + use crate::spec::types::Type; + use chrono::{NaiveDate, NaiveTime, Timelike}; + use serde_json::Number; + + fn number_to_i32(n: &Number) -> Result { + n.as_i64() + .ok_or_else(|| "Failed to convert json number to i32".to_string()) + .and_then(|v| i32::try_from(v).map_err(|e| e.to_string())) + } + fn number_to_i64(n: &Number) -> Result { + n.as_i64() + .ok_or_else(|| "Failed to convert json number to i64".to_string()) + } + fn number_to_f32(n: &Number) -> Result { + n.as_f64() + .ok_or_else(|| "Failed to convert json number to f32".to_string()) + .map(|v| v as f32) + } + fn number_to_f64(n: &Number) -> Result { + n.as_f64() + .ok_or_else(|| "Failed to convert json number to f64".to_string()) + } + + fn parse_date_to_days(s: &str) -> Result { + let d = NaiveDate::parse_from_str(s, "%Y-%m-%d") + .map_err(|e| format!("Invalid date format: {}", e))?; + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).ok_or("Bad epoch")?; + Ok((d - epoch).num_days() as i32) + } + fn parse_time_to_micros(s: &str) -> Result { + // Accept up to nanosecond precision, truncate to microseconds + let fmt_candidates = ["%H:%M:%S", "%H:%M:%S%.f"]; + let mut last_err: Option = None; + for fmt in &fmt_candidates { + match NaiveTime::parse_from_str(s, fmt) { + Ok(t) => { + let nanos = t.num_seconds_from_midnight() as i64 * 1_000_000_000 + + (t.nanosecond() as i64); + return Ok(nanos / 1_000); + } + Err(e) => last_err = Some(e.to_string()), + } + } + Err(last_err.unwrap_or_else(|| "Invalid time".to_string())) + } + fn parse_ts_to_micros(s: &str) -> Result { + // Accept naive timestamp like 2020-01-01T12:34:56[.ffffff] + let fmt_candidates = [ + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%S%.f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M:%S%.f", + ]; + let mut last_err: Option = None; + for fmt in &fmt_candidates { + match chrono::NaiveDateTime::parse_from_str(s, fmt) { + Ok(dt) => { + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1) + .ok_or("Bad epoch")? 
+ .and_hms_nano_opt(0, 0, 0, 0) + .ok_or("Bad epoch")?; + let micros = (dt - epoch).num_microseconds().ok_or("overflow")?; + return Ok(micros); + } + Err(e) => last_err = Some(e.to_string()), + } + } + // Try with timezone (treated as UTC) + match chrono::DateTime::parse_from_rfc3339(s) + .or_else(|_| chrono::DateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f%:z")) + { + Ok(dt) => Ok(dt.timestamp_micros()), + Err(_) => Err(last_err.unwrap_or_else(|| "Invalid timestamp".to_string())), + } + } + + fn parse_uuid_to_u128(s: &str) -> Result { + let u = uuid::Uuid::parse_str(s).map_err(|e| e.to_string())?; + let bytes = u.as_bytes(); + let mut acc: u128 = 0; + for b in bytes.iter() { + acc = (acc << 8) | (*b as u128); + } + Ok(acc) + } + + fn parse_decimal_to_i128(s: &str, scale: u32) -> Result { + let s = s.trim(); + if s.is_empty() { + return Err("empty decimal".to_string()); + } + let negative = s.starts_with('-'); + let s = if negative || s.starts_with('+') { + &s[1..] + } else { + s + }; + let mut int_part: i128 = 0; + let mut frac_part: i128 = 0; + let mut frac_len: u32 = 0; + let mut seen_dot = false; + for ch in s.chars() { + if ch == '.' { + if seen_dot { + return Err("multiple decimal points".to_string()); + } + seen_dot = true; + continue; + } + if !ch.is_ascii_digit() { + return Err("invalid decimal".to_string()); + } + let d = (ch as u8 - b'0') as i128; + if !seen_dot { + int_part = int_part + .checked_mul(10) + .and_then(|v| v.checked_add(d)) + .ok_or("overflow")?; + } else if frac_len < scale { + frac_part = frac_part + .checked_mul(10) + .and_then(|v| v.checked_add(d)) + .ok_or("overflow")?; + frac_len += 1; + } else { + // truncate extra fractional digits beyond scale (rounding not applied) + } + } + let pow10 = 10i128.pow(scale); + let scaled = int_part + .checked_mul(pow10) + .and_then(|v| v.checked_add(frac_part * 10i128.pow(scale - frac_len))) + .ok_or("overflow")?; + Ok(if negative { -scaled } else { scaled }) } + + Ok(match (data_type, value) { + (_, JsonValue::Null) => None, + (Type::Primitive(PrimitiveType::Boolean), JsonValue::Bool(v)) => { + Some(Literal::Primitive(PrimitiveLiteral::Boolean(v))) + } + (Type::Primitive(PrimitiveType::Int), JsonValue::Number(n)) => Some( + Literal::Primitive(PrimitiveLiteral::Int(number_to_i32(&n)?)), + ), + (Type::Primitive(PrimitiveType::Long), JsonValue::Number(n)) => Some( + Literal::Primitive(PrimitiveLiteral::Long(number_to_i64(&n)?)), + ), + (Type::Primitive(PrimitiveType::Float), JsonValue::Number(n)) => Some( + Literal::Primitive(PrimitiveLiteral::Float(OrderedFloat(number_to_f32(&n)?))), + ), + (Type::Primitive(PrimitiveType::Double), JsonValue::Number(n)) => Some( + Literal::Primitive(PrimitiveLiteral::Double(OrderedFloat(number_to_f64(&n)?))), + ), + (Type::Primitive(PrimitiveType::Date), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Int(parse_date_to_days(&s)?)), + ), + (Type::Primitive(PrimitiveType::Time), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Long(parse_time_to_micros(&s)?)), + ), + (Type::Primitive(PrimitiveType::Timestamp), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Long(parse_ts_to_micros(&s)?)), + ), + (Type::Primitive(PrimitiveType::Timestamptz), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Long(parse_ts_to_micros(&s)?)), + ), + (Type::Primitive(PrimitiveType::TimestampNs), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Long(parse_ts_to_micros(&s)? 
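// Worked examples for the JSON -> literal conversions in this match (hypothetical inputs):
//   date "2024-03-01"            -> Int(19783), days since 1970-01-01
//   time "01:02:03.500"          -> Long(3_723_500_000), microseconds from midnight
//   decimal "12.3" with scale 2  -> Int128(1230), the unscaled value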
* 1000)), + ), + (Type::Primitive(PrimitiveType::TimestamptzNs), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Long(parse_ts_to_micros(&s)? * 1000)), + ), + (Type::Primitive(PrimitiveType::String), JsonValue::String(s)) => { + Some(Literal::Primitive(PrimitiveLiteral::String(s))) + } + (Type::Primitive(PrimitiveType::Uuid), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::UInt128(parse_uuid_to_u128(&s)?)), + ), + (Type::Primitive(PrimitiveType::Binary), JsonValue::String(s)) => { + Some(Literal::Primitive(PrimitiveLiteral::Binary(s.into_bytes()))) + } + (Type::Primitive(PrimitiveType::Fixed(_)), JsonValue::String(s)) => { + Some(Literal::Primitive(PrimitiveLiteral::Binary(s.into_bytes()))) + } + (Type::Primitive(PrimitiveType::Decimal { scale, .. }), JsonValue::String(s)) => Some( + Literal::Primitive(PrimitiveLiteral::Int128(parse_decimal_to_i128(&s, *scale)?)), + ), + (Type::Struct(struct_ty), JsonValue::Object(mut map)) => { + let mut out = Vec::with_capacity(struct_ty.fields().len()); + for field in struct_ty.fields() { + let key = field.id.to_string(); + let v = map.remove(&key); + let val = match v { + Some(json) => Literal::try_from_json(json, &field.field_type) + .and_then(|opt| { + opt.ok_or_else(|| "Key of map cannot be null".to_string()) + }) + .ok(), + None => None, + }; + out.push((key, val)); + } + Some(Literal::Struct(out)) + } + (Type::List(list_ty), JsonValue::Array(arr)) => { + let mut out = Vec::with_capacity(arr.len()); + for item in arr.into_iter() { + let elem = Literal::try_from_json(item, &list_ty.element_field.field_type)?; + out.push(elem); + } + Some(Literal::List(out)) + } + (Type::Map(map_ty), JsonValue::Object(mut obj)) => { + let keys = obj.remove("keys").unwrap_or(JsonValue::Array(vec![])); + let vals = obj.remove("values").unwrap_or(JsonValue::Array(vec![])); + let (JsonValue::Array(keys), JsonValue::Array(vals)) = (keys, vals) else { + return Err("Invalid map JSON".to_string()); + }; + if keys.len() != vals.len() { + return Err("Keys and values length mismatch".to_string()); + } + let mut out = Vec::with_capacity(keys.len()); + for (k, v) in keys.into_iter().zip(vals.into_iter()) { + let key = Literal::try_from_json(k, &map_ty.key_field.field_type) + .and_then(|opt| opt.ok_or_else(|| "Map key cannot be null".to_string()))?; + let val = Literal::try_from_json(v, &map_ty.value_field.field_type)?; + out.push((key, val)); + } + Some(Literal::Map(out)) + } + // Fallback: store as string for unsupported combinations + (_, other) => Some(Literal::Primitive(PrimitiveLiteral::String( + other.to_string(), + ))), + }) } - // TODO: Type-aware JSON conversion - pub fn try_into_json( - &self, - _data_type: &crate::spec::types::Type, - ) -> Result { - match self { - Literal::Primitive(p) => match p { - PrimitiveLiteral::Boolean(v) => Ok(JsonValue::Bool(*v)), - PrimitiveLiteral::Int(v) => Ok(JsonValue::Number((*v).into())), - PrimitiveLiteral::Long(v) => Ok(JsonValue::Number((*v).into())), - PrimitiveLiteral::String(v) => Ok(JsonValue::String(v.clone())), - _ => Ok(JsonValue::String(format!("{:?}", p))), + pub fn try_into_json(&self, data_type: &crate::spec::types::Type) -> Result { + use crate::spec::types::PrimitiveType; + use crate::spec::types::Type; + use chrono::{NaiveDate, NaiveTime}; + use serde_json::Number; + + fn days_to_date_str(days: i32) -> String { + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + let d = epoch + chrono::Days::new(days as u64); + d.to_string() + } + fn micros_to_time_str(us: i64) -> 
String { + let secs = us.div_euclid(1_000_000); + let rem = (us.rem_euclid(1_000_000)) as u32; + let t = NaiveTime::from_num_seconds_from_midnight_opt(secs as u32, rem * 1000) + .unwrap_or(NaiveTime::from_hms_opt(0, 0, 0).unwrap()); + t.format("%H:%M:%S%.f").to_string() + } + fn micros_to_datetime_str(us: i64) -> String { + let secs = us.div_euclid(1_000_000); + let rem = (us.rem_euclid(1_000_000)) as u32; + let base = NaiveDate::from_ymd_opt(1970, 1, 1) + .unwrap() + .and_hms_nano_opt(0, 0, 0, 0) + .unwrap(); + let dt = base + .checked_add_signed(chrono::Duration::seconds(secs)) + .and_then(|d| { + d.checked_add_signed(chrono::Duration::nanoseconds((rem as i64) * 1000)) + }) + .unwrap_or(base); + dt.format("%Y-%m-%dT%H:%M:%S%.f").to_string() + } + + match (self, data_type) { + (Literal::Primitive(prim), Type::Primitive(prim_ty)) => match (prim_ty, prim) { + (PrimitiveType::Boolean, PrimitiveLiteral::Boolean(v)) => Ok(JsonValue::Bool(*v)), + (PrimitiveType::Int, PrimitiveLiteral::Int(v)) => { + Ok(JsonValue::Number((*v).into())) + } + (PrimitiveType::Long, PrimitiveLiteral::Long(v)) => { + Ok(JsonValue::Number((*v).into())) + } + (PrimitiveType::Float, PrimitiveLiteral::Float(v)) => Number::from_f64(v.0 as f64) + .map(JsonValue::Number) + .ok_or_else(|| "Invalid float".to_string()), + (PrimitiveType::Double, PrimitiveLiteral::Double(v)) => Number::from_f64(v.0) + .map(JsonValue::Number) + .ok_or_else(|| "Invalid double".to_string()), + (PrimitiveType::Date, PrimitiveLiteral::Int(v)) => { + Ok(JsonValue::String(days_to_date_str(*v))) + } + (PrimitiveType::Time, PrimitiveLiteral::Long(v)) => { + Ok(JsonValue::String(micros_to_time_str(*v))) + } + (PrimitiveType::Timestamp, PrimitiveLiteral::Long(v)) => { + Ok(JsonValue::String(micros_to_datetime_str(*v))) + } + (PrimitiveType::Timestamptz, PrimitiveLiteral::Long(v)) => { + Ok(JsonValue::String(micros_to_datetime_str(*v))) + } + (PrimitiveType::TimestampNs, PrimitiveLiteral::Long(v)) => { + Ok(JsonValue::String(micros_to_datetime_str(*v / 1000))) + } + (PrimitiveType::TimestamptzNs, PrimitiveLiteral::Long(v)) => { + Ok(JsonValue::String(micros_to_datetime_str(*v / 1000))) + } + (PrimitiveType::String, PrimitiveLiteral::String(s)) => { + Ok(JsonValue::String(s.clone())) + } + (PrimitiveType::Uuid, PrimitiveLiteral::UInt128(u)) => { + let mut bytes = [0u8; 16]; + let mut tmp = *u; + for i in (0..16).rev() { + bytes[i] = (tmp & 0xFF) as u8; + tmp >>= 8; + } + let u = uuid::Uuid::from_bytes(bytes); + Ok(JsonValue::String(u.to_string())) + } + (PrimitiveType::Decimal { scale, .. 
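// The Decimal arm below re-inserts the decimal point from the unscaled value, so (hypothetical
// inputs) Int128(1230) with scale 2 renders as "12.30" and Int128(-5) with scale 3 as "-0.005".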
}, PrimitiveLiteral::Int128(v)) => { + // render scaled decimal as string + let neg = *v < 0; + let x = v.abs(); + let scale = *scale as usize; + let mut s = if scale == 0 { + x.to_string() + } else { + let mut frac = String::with_capacity(scale); + let mut tmp = x; + for _ in 0..scale { + frac.insert(0, char::from(b'0' + (tmp % 10) as u8)); + tmp /= 10; + } + let int_part = tmp.to_string(); + format!("{}.{frac}", int_part) + }; + if neg { + s.insert(0, '-'); + } + Ok(JsonValue::String(s)) + } + (PrimitiveType::Binary, PrimitiveLiteral::Binary(b)) => { + // store as UTF-8 string of bytes if valid; otherwise hex-ish + Ok(JsonValue::String(String::from_utf8_lossy(b).into_owned())) + } + (PrimitiveType::Fixed(_), PrimitiveLiteral::Binary(b)) => { + Ok(JsonValue::String(String::from_utf8_lossy(b).into_owned())) + } + // Fallback for mismatched pairs + _ => Ok(JsonValue::Null), }, - _ => Ok(JsonValue::String(format!("{:?}", self))), + (Literal::Struct(s), Type::Struct(struct_ty)) => { + let mut map = serde_json::Map::with_capacity(struct_ty.fields().len()); + for ((id_str, val_opt), field) in s.iter().zip(struct_ty.fields()) { + let key = id_str.clone(); + let json = match val_opt { + Some(l) => l.try_into_json(&field.field_type)?, + None => JsonValue::Null, + }; + map.insert(key, json); + } + Ok(JsonValue::Object(map)) + } + (Literal::List(list), Type::List(list_ty)) => { + let mut arr = Vec::with_capacity(list.len()); + for opt in list.iter() { + match opt { + Some(l) => arr.push(l.try_into_json(&list_ty.element_field.field_type)?), + None => arr.push(JsonValue::Null), + } + } + Ok(JsonValue::Array(arr)) + } + (Literal::Map(map), Type::Map(map_ty)) => { + let mut keys = Vec::with_capacity(map.len()); + let mut vals = Vec::with_capacity(map.len()); + for (k, v) in map.iter() { + keys.push(k.try_into_json(&map_ty.key_field.field_type)?); + vals.push(match v { + Some(l) => l.try_into_json(&map_ty.value_field.field_type)?, + None => JsonValue::Null, + }); + } + let mut obj = serde_json::Map::new(); + obj.insert("keys".to_string(), JsonValue::Array(keys)); + obj.insert("values".to_string(), JsonValue::Array(vals)); + Ok(JsonValue::Object(obj)) + } + // Fallback + _ => Ok(JsonValue::Null), } } } From 962fccb00bc78b5e41d086eb713320b7e999c714 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 14 Oct 2025 11:25:21 +0800 Subject: [PATCH 23/32] test update --- .../spark/iceberg/test_iceberg_pruning.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py index 6c8b56b76f..04e36fec2b 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py @@ -29,6 +29,16 @@ def _extract_files_scanned(explain_output): return None +def _files_scanned(df): + import io + import contextlib + + buf = io.StringIO() + with contextlib.redirect_stdout(buf): + df.explain() + return _extract_files_scanned(buf.getvalue()) + + def test_equality_and_in(spark, tmp_path): catalog = _create_catalog(tmp_path) table = catalog.create_table( @@ -55,12 +65,21 @@ def test_equality_and_in(spark, tmp_path): df = spark.read.format("iceberg").load(tp).filter("year = 2023") assert df.count() == 4 + scanned = _files_scanned(df) + if scanned is not None: + assert scanned == 2 df = spark.read.format("iceberg").load(tp).filter("year = 2023 AND month = 1") assert df.count() == 2 + scanned = _files_scanned(df) + if scanned is not None: + assert 
scanned == 1 df = spark.read.format("iceberg").load(tp).filter("month IN (2)") assert df.count() == 4 + scanned = _files_scanned(df) + if scanned is not None: + assert scanned == 2 finally: catalog.drop_table("default.prune_eq_in") @@ -88,12 +107,21 @@ def test_comparison_and_between(spark, tmp_path): df = spark.read.format("iceberg").load(tp).filter("year > 2022") assert df.count() == 6 + scanned = _files_scanned(df) + if scanned is not None: + assert scanned == 1 df = spark.read.format("iceberg").load(tp).filter("year BETWEEN 2022 AND 2023") assert df.count() == 6 + scanned = _files_scanned(df) + if scanned is not None: + assert scanned == 2 df = spark.read.format("iceberg").load(tp).filter("year >= 2023 AND month >= 6") assert df.count() == 4 + scanned = _files_scanned(df) + if scanned is not None: + assert scanned == 1 finally: catalog.drop_table("default.prune_cmp") From 65d39ae92226b66cefa196b538eda4616eaa2e43 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Tue, 14 Oct 2025 12:40:45 +0800 Subject: [PATCH 24/32] update --- .../sail-iceberg/src/datasource/provider.rs | 86 ++++++++++++++++++- crates/sail-iceberg/src/datasource/pruning.rs | 38 ++++++-- crates/sail-iceberg/src/spec/types/values.rs | 11 ++- .../spark/iceberg/test_iceberg_pruning.py | 38 -------- 4 files changed, 124 insertions(+), 49 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index b240d23905..8754c0742e 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -30,12 +30,19 @@ use crate::arrow_conversion::iceberg_schema_to_arrow; use crate::datasource::expr_adapter::IcebergPhysicalExprAdapterFactory; use crate::datasource::expressions::simplify_expr; use crate::datasource::pruning::{prune_files, prune_manifests_by_partition_summaries}; +use crate::spec::manifest::DataContentType; use crate::spec::types::values::{Literal, PrimitiveLiteral}; use crate::spec::{ DataFile, FormatVersion, Manifest, ManifestContentType, ManifestList, ManifestStatus, PartitionSpec, Schema, Snapshot, }; +#[derive(Debug, Clone)] +struct IcebergDeleteAttachment { + eq_delete_count: usize, + pos_delete_count: usize, +} + /// Iceberg table provider for DataFusion #[derive(Debug)] pub struct IcebergTableProvider { @@ -236,10 +243,75 @@ impl IcebergTableProvider { Ok(data_files) } + fn partition_key_for(&self, partition: &[Option]) -> String { + serde_json::to_string(partition).unwrap_or_default() + } + + async fn load_delete_index( + &self, + object_store: &Arc, + manifest_list: &ManifestList, + ) -> DataFusionResult> { + let mut index: std::collections::HashMap = + std::collections::HashMap::new(); + + for manifest_file in manifest_list + .entries() + .iter() + .filter(|mf| mf.content == ManifestContentType::Deletes) + { + let manifest_path_str = manifest_file.manifest_path.as_str(); + let manifest_path = if let Ok(url) = Url::parse(manifest_path_str) { + ObjectPath::from(url.path()) + } else { + ObjectPath::from(manifest_path_str) + }; + + let manifest_data = object_store + .get(&manifest_path) + .await + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))? 
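// The resulting index maps a JSON-serialized partition tuple (see partition_key_for) to the
// number of equality-delete and position-delete files recorded for that partition; it is later
// attached to each PartitionedFile via the `extensions` field so downstream code can see which
// files have associated delete files.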
+ .bytes() + .await + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + + let manifest = Manifest::parse_avro(&manifest_data) + .map_err(datafusion::common::DataFusionError::Execution)?; + + for entry_ref in manifest.entries().iter() { + let entry = entry_ref.as_ref(); + if !matches!( + entry.status, + ManifestStatus::Added | ManifestStatus::Existing + ) { + continue; + } + let df = &entry.data_file; + let key = self.partition_key_for(df.partition()); + let att = index.entry(key).or_insert(IcebergDeleteAttachment { + eq_delete_count: 0, + pos_delete_count: 0, + }); + match df.content_type() { + DataContentType::EqualityDeletes => { + att.eq_delete_count = att.eq_delete_count.saturating_add(1); + } + DataContentType::PositionDeletes => { + att.pos_delete_count = att.pos_delete_count.saturating_add(1); + } + _ => {} + } + } + } + + Ok(index) + } + /// Create partitioned files for DataFusion from Iceberg data files fn create_partitioned_files( &self, data_files: Vec, + delete_index: &std::collections::HashMap, ) -> DataFusionResult> { let mut partitioned_files = Vec::new(); @@ -282,12 +354,17 @@ impl IcebergTableProvider { }) .collect(); + let key = self.partition_key_for(data_file.partition()); + let extensions: Option> = delete_index + .get(&key) + .map(|att| Arc::new(att.clone()) as Arc); + let partitioned_file = PartitionedFile { object_meta, partition_values, range: None, statistics: Some(Arc::new(self.create_file_statistics(&data_file))), - extensions: None, + extensions, metadata_size_hint: None, }; @@ -562,8 +639,13 @@ impl TableProvider for IcebergTableProvider { log::trace!("Pruned data files, remaining: {}", data_files.len()); } + log::trace!("Loading delete manifests..."); + let delete_index = self + .load_delete_index(&object_store, &manifest_list) + .await?; + log::trace!("Creating partitioned files..."); - let partitioned_files = self.create_partitioned_files(data_files.clone())?; + let partitioned_files = self.create_partitioned_files(data_files.clone(), &delete_index)?; log::trace!("Created {} partitioned files", partitioned_files.len()); // Step 4: Create file groups diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index f9f6a373d4..6e9cc19e6c 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -1,3 +1,4 @@ +use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; @@ -76,6 +77,10 @@ pub struct IcebergPruningStats { arrow_schema: Arc, /// Arrow field name -> Iceberg field id name_to_field_id: HashMap, + min_cache: RefCell>, + max_cache: RefCell>, + nulls_cache: RefCell>, + rows_cache: RefCell>, } impl IcebergPruningStats { @@ -92,6 +97,10 @@ impl IcebergPruningStats { files, arrow_schema, name_to_field_id, + min_cache: RefCell::new(HashMap::new()), + max_cache: RefCell::new(HashMap::new()), + nulls_cache: RefCell::new(HashMap::new()), + rows_cache: RefCell::new(None), } } @@ -110,19 +119,26 @@ impl PruningStatistics for IcebergPruningStats { fn min_values(&self, column: &Column) -> Option { // TODO: Materialize arrays only for columns referenced by the predicate let field_id = self.field_id_for(column)?; + if let Some(arr) = self.min_cache.borrow().get(&field_id) { + return Some(arr.clone()); + } let scalars = self.files.iter().map(|f| { f.lower_bounds() .get(&field_id) .map(|d| self.datum_to_scalar(d)) }); - // Build an Arrow array from Option let values = scalars.map(|opt| 
opt.unwrap_or(datafusion::common::scalar::ScalarValue::Null)); - datafusion::common::scalar::ScalarValue::iter_to_array(values).ok() + let arr = datafusion::common::scalar::ScalarValue::iter_to_array(values).ok()?; + self.min_cache.borrow_mut().insert(field_id, arr.clone()); + Some(arr) } fn max_values(&self, column: &Column) -> Option { let field_id = self.field_id_for(column)?; + if let Some(arr) = self.max_cache.borrow().get(&field_id) { + return Some(arr.clone()); + } let scalars = self.files.iter().map(|f| { f.upper_bounds() .get(&field_id) @@ -130,7 +146,9 @@ impl PruningStatistics for IcebergPruningStats { }); let values = scalars.map(|opt| opt.unwrap_or(datafusion::common::scalar::ScalarValue::Null)); - datafusion::common::scalar::ScalarValue::iter_to_array(values).ok() + let arr = datafusion::common::scalar::ScalarValue::iter_to_array(values).ok()?; + self.max_cache.borrow_mut().insert(field_id, arr.clone()); + Some(arr) } fn num_containers(&self) -> usize { @@ -139,17 +157,27 @@ impl PruningStatistics for IcebergPruningStats { fn null_counts(&self, column: &Column) -> Option { let field_id = self.field_id_for(column)?; + if let Some(arr) = self.nulls_cache.borrow().get(&field_id) { + return Some(arr.clone()); + } let counts: Vec = self .files .iter() .map(|f| f.null_value_counts().get(&field_id).copied().unwrap_or(0)) .collect(); - Some(Arc::new(UInt64Array::from(counts)) as ArrayRef) + let arr: ArrayRef = Arc::new(UInt64Array::from(counts)); + self.nulls_cache.borrow_mut().insert(field_id, arr.clone()); + Some(arr) } fn row_counts(&self, _column: &Column) -> Option { + if let Some(arr) = self.rows_cache.borrow().as_ref() { + return Some(arr.clone()); + } let rows: Vec = self.files.iter().map(|f| f.record_count()).collect(); - Some(Arc::new(UInt64Array::from(rows)) as ArrayRef) + let arr: ArrayRef = Arc::new(UInt64Array::from(rows)); + *self.rows_cache.borrow_mut() = Some(arr.clone()); + Some(arr) } fn contained( diff --git a/crates/sail-iceberg/src/spec/types/values.rs b/crates/sail-iceberg/src/spec/types/values.rs index b0249e426f..443b006813 100644 --- a/crates/sail-iceberg/src/spec/types/values.rs +++ b/crates/sail-iceberg/src/spec/types/values.rs @@ -67,11 +67,11 @@ impl Literal { value: JsonValue, data_type: &crate::spec::types::Type, ) -> Result, String> { - use crate::spec::types::PrimitiveType; - use crate::spec::types::Type; use chrono::{NaiveDate, NaiveTime, Timelike}; use serde_json::Number; + use crate::spec::types::{PrimitiveType, Type}; + fn number_to_i32(n: &Number) -> Result { n.as_i64() .ok_or_else(|| "Failed to convert json number to i32".to_string()) @@ -305,12 +305,13 @@ impl Literal { } pub fn try_into_json(&self, data_type: &crate::spec::types::Type) -> Result { - use crate::spec::types::PrimitiveType; - use crate::spec::types::Type; use chrono::{NaiveDate, NaiveTime}; use serde_json::Number; + use crate::spec::types::{PrimitiveType, Type}; + fn days_to_date_str(days: i32) -> String { + #[allow(clippy::unwrap_used)] let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); let d = epoch + chrono::Days::new(days as u64); d.to_string() @@ -318,6 +319,7 @@ impl Literal { fn micros_to_time_str(us: i64) -> String { let secs = us.div_euclid(1_000_000); let rem = (us.rem_euclid(1_000_000)) as u32; + #[allow(clippy::unwrap_used)] let t = NaiveTime::from_num_seconds_from_midnight_opt(secs as u32, rem * 1000) .unwrap_or(NaiveTime::from_hms_opt(0, 0, 0).unwrap()); t.format("%H:%M:%S%.f").to_string() @@ -325,6 +327,7 @@ impl Literal { fn micros_to_datetime_str(us: i64) 
-> String { let secs = us.div_euclid(1_000_000); let rem = (us.rem_euclid(1_000_000)) as u32; + #[allow(clippy::unwrap_used)] let base = NaiveDate::from_ymd_opt(1970, 1, 1) .unwrap() .and_hms_nano_opt(0, 0, 0, 0) diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py index 04e36fec2b..e86ba89c33 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py @@ -19,26 +19,6 @@ def _create_catalog(tmp_path): return catalog -def _extract_files_scanned(explain_output): - for line in explain_output.split("\n"): - if "files scanned" in line.lower() or "file" in line.lower(): - parts = line.split() - for i, part in enumerate(parts): - if part.isdigit() and i + 2 < len(parts) and parts[i + 2].lower() in ["files", "file"]: - return int(part) - return None - - -def _files_scanned(df): - import io - import contextlib - - buf = io.StringIO() - with contextlib.redirect_stdout(buf): - df.explain() - return _extract_files_scanned(buf.getvalue()) - - def test_equality_and_in(spark, tmp_path): catalog = _create_catalog(tmp_path) table = catalog.create_table( @@ -65,21 +45,12 @@ def test_equality_and_in(spark, tmp_path): df = spark.read.format("iceberg").load(tp).filter("year = 2023") assert df.count() == 4 - scanned = _files_scanned(df) - if scanned is not None: - assert scanned == 2 df = spark.read.format("iceberg").load(tp).filter("year = 2023 AND month = 1") assert df.count() == 2 - scanned = _files_scanned(df) - if scanned is not None: - assert scanned == 1 df = spark.read.format("iceberg").load(tp).filter("month IN (2)") assert df.count() == 4 - scanned = _files_scanned(df) - if scanned is not None: - assert scanned == 2 finally: catalog.drop_table("default.prune_eq_in") @@ -107,21 +78,12 @@ def test_comparison_and_between(spark, tmp_path): df = spark.read.format("iceberg").load(tp).filter("year > 2022") assert df.count() == 6 - scanned = _files_scanned(df) - if scanned is not None: - assert scanned == 1 df = spark.read.format("iceberg").load(tp).filter("year BETWEEN 2022 AND 2023") assert df.count() == 6 - scanned = _files_scanned(df) - if scanned is not None: - assert scanned == 2 df = spark.read.format("iceberg").load(tp).filter("year >= 2023 AND month >= 6") assert df.count() == 4 - scanned = _files_scanned(df) - if scanned is not None: - assert scanned == 1 finally: catalog.drop_table("default.prune_cmp") From ee1aa19fcb53ecb4faa1a98edb3dfde7a26b2919 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Wed, 15 Oct 2025 12:00:37 +0800 Subject: [PATCH 25/32] basic options --- .../src/options/data/iceberg_read.yaml | 32 +++++++++++++++++++ .../src/options/data/iceberg_write.yaml | 22 +++++++++++++ crates/sail-iceberg/src/options.rs | 9 ++++++ 3 files changed, 63 insertions(+) create mode 100644 crates/sail-data-source/src/options/data/iceberg_read.yaml create mode 100644 crates/sail-data-source/src/options/data/iceberg_write.yaml create mode 100644 crates/sail-iceberg/src/options.rs diff --git a/crates/sail-data-source/src/options/data/iceberg_read.yaml b/crates/sail-data-source/src/options/data/iceberg_read.yaml new file mode 100644 index 0000000000..55c067f0a1 --- /dev/null +++ b/crates/sail-data-source/src/options/data/iceberg_read.yaml @@ -0,0 +1,32 @@ +# Options for reading from an Apache Iceberg table. 
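#
# Hypothetical usage sketch (assuming a Spark session backed by Sail, as in the Python tests in
# this series):
#
#   spark.read.format("iceberg").option("snapshot_id", 123456789).load("/path/to/table")
#
# would surface the time-travel options declared below to the reader.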
+ +- key: use_ref + aliases: + - ref + - branch + - tag + - iceberg.ref + description: | + Select a snapshot reference (tag or branch) to time-travel when reading. + If unset, the current snapshot is used. + supported: true + rust_type: String + +- key: snapshot_id + aliases: + - snapshot-id + - snapshotId + description: | + Select a specific snapshot id to time-travel when reading. + supported: true + rust_type: i64 + rust_deserialize_with: crate::options::serde::deserialize_i64 + +- key: timestamp_as_of + aliases: + - timestampAsOf + description: | + Select snapshot as of the given timestamp. Accepts RFC3339 or 'yyyy-MM-dd HH:mm:ss.SSS'. + supported: true + rust_type: String + diff --git a/crates/sail-data-source/src/options/data/iceberg_write.yaml b/crates/sail-data-source/src/options/data/iceberg_write.yaml new file mode 100644 index 0000000000..5dcc95fcb5 --- /dev/null +++ b/crates/sail-data-source/src/options/data/iceberg_write.yaml @@ -0,0 +1,22 @@ +# Options for writing to an Apache Iceberg table. + +- key: overwrite_schema + aliases: + - overwriteSchema + description: | + If `true`, allows overwriting the schema of the table when using overwrite mode. + default: "false" + supported: true + rust_type: bool + rust_deserialize_with: crate::options::serde::deserialize_bool + +- key: merge_schema + aliases: + - mergeSchema + description: | + If `true`, allows automatic schema merging during an append or overwrite operation. + default: "false" + supported: true + rust_type: bool + rust_deserialize_with: crate::options::serde::deserialize_bool + diff --git a/crates/sail-iceberg/src/options.rs b/crates/sail-iceberg/src/options.rs new file mode 100644 index 0000000000..0e35d8da4c --- /dev/null +++ b/crates/sail-iceberg/src/options.rs @@ -0,0 +1,9 @@ +use serde::{Deserialize, Serialize}; + +/// Options that control the behavior of Iceberg table reads (time-travel, refs). 
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] +pub struct TableIcebergOptions { + pub use_ref: Option, + pub snapshot_id: Option, + pub timestamp_as_of: Option, +} From c47f60529931a64079682d893c783aece729bddb Mon Sep 17 00:00:00 2001 From: xiaolong Date: Wed, 15 Oct 2025 17:18:27 +0800 Subject: [PATCH 26/32] update --- Cargo.lock | 1 + crates/sail-iceberg/Cargo.toml | 2 +- .../sail-iceberg/src/datasource/provider.rs | 66 +++++-- crates/sail-iceberg/src/datasource/pruning.rs | 61 ++++++- crates/sail-iceberg/src/spec/manifest_list.rs | 167 +++++++++++++++++- pyproject.toml | 8 +- .../iceberg/test_iceberg_partitioned_reads.py | 126 +++++++++++++ .../spark/iceberg/test_iceberg_projection.py | 58 ++++++ .../tests/spark/iceberg/test_iceberg_reads.py | 125 +++++++++++++ python/pysail/tests/spark/iceberg/utils.py | 21 +++ 10 files changed, 605 insertions(+), 30 deletions(-) create mode 100644 python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py create mode 100644 python/pysail/tests/spark/iceberg/test_iceberg_projection.py create mode 100644 python/pysail/tests/spark/iceberg/test_iceberg_reads.py create mode 100644 python/pysail/tests/spark/iceberg/utils.py diff --git a/Cargo.lock b/Cargo.lock index 5d19e6e182..835458331e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5674,6 +5674,7 @@ dependencies = [ "object_store", "once_cell", "ordered-float 5.1.0", + "percent-encoding", "sail-common-datafusion", "serde", "serde_json", diff --git a/crates/sail-iceberg/Cargo.toml b/crates/sail-iceberg/Cargo.toml index 4eb5352482..f632c24a02 100644 --- a/crates/sail-iceberg/Cargo.toml +++ b/crates/sail-iceberg/Cargo.toml @@ -27,7 +27,7 @@ uuid = { workspace = true } # indexmap = { workspace = true } log = { workspace = true } # itertools = { workspace = true } -# percent-encoding = { workspace = true } +percent-encoding = { workspace = true } once_cell = { workspace = true } ordered-float = { workspace = true } apache-avro = { workspace = true } diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 8754c0742e..a648c178d2 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -307,6 +307,38 @@ impl IcebergTableProvider { Ok(index) } + // Attention: take care of percent-encoding when resolving the data file object path + fn resolve_data_file_object_path(&self, table_base_path: &str, raw_path: &str) -> ObjectPath { + // If the data file path is a full URL, strip scheme/authority and use its path + if let Ok(url) = Url::parse(raw_path) { + let encoded_path = url.path(); + let path_no_leading = encoded_path.strip_prefix('/').unwrap_or(encoded_path); + if let Ok(p) = ObjectPath::parse(path_no_leading) { + return p; + } + return ObjectPath::from(path_no_leading); + } + + // If the data file path already starts with the table base path, use it as-is + if raw_path.starts_with(table_base_path) { + return ObjectPath::from(raw_path); + } + + // If it is an absolute filesystem path, use it directly + if raw_path.starts_with(object_store::path::DELIMITER) { + return ObjectPath::from(raw_path); + } + + // Otherwise, treat as a path relative to the table base path (preserve encoding) + let joined = format!( + "{}{}{}", + table_base_path, + object_store::path::DELIMITER, + raw_path + ); + ObjectPath::from(joined) + } + /// Create partitioned files for DataFusion from Iceberg data files fn create_partitioned_files( &self, @@ -320,19 +352,9 @@ impl IcebergTableProvider { let 
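// resolve_data_file_object_path (defined above) normalizes the manifest's file paths before the
// loop below: a full URL such as "s3://bucket/warehouse/tbl/data/a.parquet" (illustrative) is
// reduced to its path component, a path already under the table base path or an absolute path is
// used as-is, and anything else is joined onto the table base path resolved next.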
table_base_path = table_url.path(); for data_file in data_files { - let file_path_str = data_file.file_path(); - log::trace!("Processing data file: {}", file_path_str); - - let file_path = if let Ok(url) = Url::parse(file_path_str) { - ObjectPath::from(url.path()) - } else { - ObjectPath::from(format!( - "{}{}{}", - table_base_path, - object_store::path::DELIMITER, - file_path_str - )) - }; + let raw_path = data_file.file_path(); + let file_path = self.resolve_data_file_object_path(table_base_path, raw_path); + log::trace!("Processing data file: {}", file_path); log::trace!("Final ObjectPath: {}", file_path); @@ -687,6 +709,22 @@ impl TableProvider for IcebergTableProvider { // Build table statistics from pruned files let table_stats = self.aggregate_statistics(&data_files); + let expanded_projection: Option> = if let Some(used) = projection { + let mut cols: Vec = used.clone(); + if let Some(expr) = conjunction(filters.iter().cloned()) { + for c in expr.column_refs() { + if let Ok(idx) = self.arrow_schema.index_of(c.name.as_str()) { + if !cols.contains(&idx) { + cols.push(idx); + } + } + } + } + Some(cols) + } else { + None + }; + let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_schema, parquet_source) .with_file_groups(if file_groups.is_empty() { @@ -695,7 +733,7 @@ impl TableProvider for IcebergTableProvider { file_groups }) .with_statistics(table_stats) - .with_projection(projection.cloned()) + .with_projection(expanded_projection) .with_limit(limit) .with_expr_adapter(Some(Arc::new(IcebergPhysicalExprAdapterFactory {}) as Arc)) diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index 6e9cc19e6c..d77aa9c33f 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -77,6 +77,8 @@ pub struct IcebergPruningStats { arrow_schema: Arc, /// Arrow field name -> Iceberg field id name_to_field_id: HashMap, + /// Iceberg field id -> Iceberg primitive type (for proper ScalarValue typing) + field_id_to_type: HashMap, min_cache: RefCell>, max_cache: RefCell>, nulls_cache: RefCell>, @@ -90,13 +92,18 @@ impl IcebergPruningStats { iceberg_schema: &Schema, ) -> Self { let mut name_to_field_id = HashMap::new(); + let mut field_id_to_type = HashMap::new(); for f in iceberg_schema.fields().iter() { name_to_field_id.insert(f.name.clone(), f.id); + if let crate::spec::types::Type::Primitive(p) = f.field_type.as_ref() { + field_id_to_type.insert(f.id, p.clone()); + } } Self { files, arrow_schema, name_to_field_id, + field_id_to_type, min_cache: RefCell::new(HashMap::new()), max_cache: RefCell::new(HashMap::new()), nulls_cache: RefCell::new(HashMap::new()), @@ -108,10 +115,48 @@ impl IcebergPruningStats { self.name_to_field_id.get(&column.name).copied() } - fn datum_to_scalar(&self, datum: &Datum) -> datafusion::common::scalar::ScalarValue { - // Reuse existing literal conversion via Datum.literal - // TODO: Avoid allocations by caching conversions per field id - literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + fn datum_to_scalar_for_field( + &self, + field_id: i32, + datum: &Datum, + ) -> datafusion::common::scalar::ScalarValue { + use datafusion::common::scalar::ScalarValue as SV; + // Convert according to the Iceberg field primitive type to match Arrow schema + match self.field_id_to_type.get(&field_id) { + Some(PrimitiveType::Date) => { + // Iceberg encodes date as days (i32) + if let crate::spec::types::values::PrimitiveLiteral::Int(v) = 
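// Mapping Iceberg bound values onto the Arrow-typed scalars used for pruning: Date becomes
// Date32 (days since epoch), Timestamp/TimestampNs become TimestampMicrosecond without a zone,
// Timestamptz/TimestamptzNs become TimestampMicrosecond tagged "UTC", and Time becomes
// Time64Microsecond; everything else falls back to the generic literal conversion.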
datum.literal { + SV::Date32(Some(v)) + } else { + literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + } + } + Some(PrimitiveType::Timestamp | PrimitiveType::TimestampNs) => { + // Iceberg encodes timestamp without tz as microseconds (long) + if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { + SV::TimestampMicrosecond(Some(v), None) + } else { + literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + } + } + Some(PrimitiveType::Timestamptz | PrimitiveType::TimestamptzNs) => { + if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { + // Use UTC; Arrow keeps tz as string label + SV::TimestampMicrosecond(Some(v), Some(std::sync::Arc::from("UTC"))) + } else { + literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + } + } + Some(PrimitiveType::Time) => { + if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { + // Iceberg time is microseconds from midnight + SV::Time64Microsecond(Some(v)) + } else { + literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + } + } + _ => literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())), + } } } @@ -125,7 +170,7 @@ impl PruningStatistics for IcebergPruningStats { let scalars = self.files.iter().map(|f| { f.lower_bounds() .get(&field_id) - .map(|d| self.datum_to_scalar(d)) + .map(|d| self.datum_to_scalar_for_field(field_id, d)) }); let values = scalars.map(|opt| opt.unwrap_or(datafusion::common::scalar::ScalarValue::Null)); @@ -142,7 +187,7 @@ impl PruningStatistics for IcebergPruningStats { let scalars = self.files.iter().map(|f| { f.upper_bounds() .get(&field_id) - .map(|d| self.datum_to_scalar(d)) + .map(|d| self.datum_to_scalar_for_field(field_id, d)) }); let values = scalars.map(|opt| opt.unwrap_or(datafusion::common::scalar::ScalarValue::Null)); @@ -193,8 +238,8 @@ impl PruningStatistics for IcebergPruningStats { let lower = f.lower_bounds().get(&field_id); let upper = f.upper_bounds().get(&field_id); if let (Some(lb), Some(ub)) = (lower, upper) { - let lb_sv = self.datum_to_scalar(lb); - let ub_sv = self.datum_to_scalar(ub); + let lb_sv = self.datum_to_scalar_for_field(field_id, lb); + let ub_sv = self.datum_to_scalar_for_field(field_id, ub); let mut any_match = false; for v in _value.iter() { if &lb_sv == v && &ub_sv == v { diff --git a/crates/sail-iceberg/src/spec/manifest_list.rs b/crates/sail-iceberg/src/spec/manifest_list.rs index 699fe78819..80253da145 100644 --- a/crates/sail-iceberg/src/spec/manifest_list.rs +++ b/crates/sail-iceberg/src/spec/manifest_list.rs @@ -17,6 +17,7 @@ // [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest_list.rs +use apache_avro::types::Value as AvroValue; use apache_avro::{from_value as avro_from_value, Reader as AvroReader}; use serde::{Deserialize, Serialize}; @@ -78,9 +79,17 @@ impl ManifestList { let mut entries = Vec::new(); for value in reader { let value = value.map_err(|e| format!("Avro read value error: {e}"))?; - let v2: _serde::ManifestFileV2 = - avro_from_value(&value).map_err(|e| format!("Avro decode error: {e}"))?; - entries.push(ManifestFile::from(v2)); + match avro_from_value::<_serde::ManifestFileV2>(&value) { + Ok(v2) => entries.push(ManifestFile::from(v2)), + Err(_) => { + if let Ok(mf) = Self::parse_manifest_v2_fallback(&value) { + entries.push(mf); + } else { + let err = format!("Avro decode error: Failed to deserialize Avro value 
into ManifestFileV2: {value:?}");
+                        return Err(err);
+                    }
+                }
+            }
         }
         Ok(ManifestList::new(entries))
     }

@@ -115,6 +124,158 @@ impl From for FieldSummary {
     }
 }

+impl ManifestList {
+    fn parse_manifest_v2_fallback(value: &AvroValue) -> Result<ManifestFile, String> {
+        match value {
+            AvroValue::Record(fields) => {
+                let get = |name: &str| -> Option<&AvroValue> {
+                    fields.iter().find(|(k, _)| k == name).map(|(_, v)| v)
+                };
+
+                let string = |v: &AvroValue| -> Result<String, String> {
+                    if let AvroValue::String(s) = v {
+                        Ok(s.clone())
+                    } else {
+                        Err("string".into())
+                    }
+                };
+                let long = |v: &AvroValue| -> Result<i64, String> {
+                    if let AvroValue::Long(x) = v {
+                        Ok(*x)
+                    } else {
+                        Err("long".into())
+                    }
+                };
+                let int = |v: &AvroValue| -> Result<i32, String> {
+                    if let AvroValue::Int(x) = v {
+                        Ok(*x)
+                    } else {
+                        Err("int".into())
+                    }
+                };
+
+                let manifest_path = string(get("manifest_path").ok_or("manifest_path")?)?;
+                let manifest_length = long(get("manifest_length").ok_or("manifest_length")?)?;
+                let partition_spec_id = int(get("partition_spec_id").ok_or("partition_spec_id")?)?;
+                let content = int(get("content").unwrap_or(&AvroValue::Int(0)))?;
+                let sequence_number = long(get("sequence_number").ok_or("sequence_number")?)?;
+                let min_sequence_number =
+                    long(get("min_sequence_number").ok_or("min_sequence_number")?)?;
+                let added_snapshot_id = long(get("added_snapshot_id").ok_or("added_snapshot_id")?)?;
+                let added_files_count = get("added_files_count")
+                    .or_else(|| get("added_data_files_count"))
+                    .and_then(|v| {
+                        if let AvroValue::Int(x) = v {
+                            Some(*x)
+                        } else {
+                            None
+                        }
+                    })
+                    .unwrap_or(0);
+                let existing_files_count = get("existing_files_count")
+                    .or_else(|| get("existing_data_files_count"))
+                    .and_then(|v| {
+                        if let AvroValue::Int(x) = v {
+                            Some(*x)
+                        } else {
+                            None
+                        }
+                    })
+                    .unwrap_or(0);
+                let deleted_files_count = get("deleted_files_count")
+                    .or_else(|| get("deleted_data_files_count"))
+                    .and_then(|v| {
+                        if let AvroValue::Int(x) = v {
+                            Some(*x)
+                        } else {
+                            None
+                        }
+                    })
+                    .unwrap_or(0);
+                let added_rows_count = long(get("added_rows_count").ok_or("added_rows_count")?)?;
+                let existing_rows_count =
+                    long(get("existing_rows_count").ok_or("existing_rows_count")?)?;
+                let deleted_rows_count =
+                    long(get("deleted_rows_count").ok_or("deleted_rows_count")?)?;
+
+                let partitions = match get("partitions") {
+                    Some(AvroValue::Union(_, inner)) => match inner.as_ref() {
+                        AvroValue::Array(items) => {
+                            let mut out = Vec::new();
+                            for it in items {
+                                if let AvroValue::Record(fs) = it {
+                                    let getf =
+                                        |n: &str| fs.iter().find(|(k, _)| k == n).map(|(_, v)| v);
+                                    let contains_null = matches!(getf("contains_null"), Some(AvroValue::Boolean(b)) if *b);
+                                    let contains_nan = match getf("contains_nan") {
+                                        Some(AvroValue::Boolean(b)) => Some(*b),
+                                        _ => None,
+                                    };
+                                    let lower_bound_bytes = match getf("lower_bound") {
+                                        Some(AvroValue::Bytes(b)) => Some(b.clone()),
+                                        _ => None,
+                                    };
+                                    let upper_bound_bytes = match getf("upper_bound") {
+                                        Some(AvroValue::Bytes(b)) => Some(b.clone()),
+                                        _ => None,
+                                    };
+                                    let mut fs = FieldSummary::new(contains_null);
+                                    if let Some(b) = contains_nan {
+                                        fs = fs.with_contains_nan(b);
+                                    }
+                                    fs.lower_bound_bytes = lower_bound_bytes;
+                                    fs.upper_bound_bytes = upper_bound_bytes;
+                                    out.push(fs);
+                                }
+                            }
+                            Some(out)
+                        }
+                        AvroValue::Null => None,
+                        _ => None,
+                    },
+                    // Some writers may encode unexpected types; ignore invalid values
+                    _ => None,
+                };
+
+                let key_metadata = match get("key_metadata") {
+                    Some(AvroValue::Union(_, inner)) => match inner.as_ref() {
+                        AvroValue::Bytes(b) => Some(b.clone()),
+                        AvroValue::Null => None,
+                        _ =>
None, + }, + Some(AvroValue::Bytes(b)) => Some(b.clone()), + _ => None, + }; + + let content = match content { + 0 => ManifestContentType::Data, + 1 => ManifestContentType::Deletes, + _ => ManifestContentType::Data, + }; + + Ok(ManifestFile { + manifest_path, + manifest_length, + partition_spec_id, + content, + sequence_number, + min_sequence_number, + added_snapshot_id, + added_files_count: Some(added_files_count), + existing_files_count: Some(existing_files_count), + deleted_files_count: Some(deleted_files_count), + added_rows_count: Some(added_rows_count), + existing_rows_count: Some(existing_rows_count), + deleted_rows_count: Some(deleted_rows_count), + partitions, + key_metadata, + }) + } + _ => Err("not a record".into()), + } + } +} + impl From<_serde::ManifestFileV2> for ManifestFile { fn from(avro: _serde::ManifestFileV2) -> Self { let content = match avro.content { diff --git a/pyproject.toml b/pyproject.toml index 1d0e8edf60..8d97f831b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ test = [ "duckdb>=1.0,<2", "pytest>=8.4,<9", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.7,<2.8", ] mcp = [ @@ -78,7 +78,7 @@ dependencies = [ "mcp>=1.0,<2", "boto3>=1.38,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.7,<2.8", ] path = ".venvs/default" @@ -118,7 +118,7 @@ dependencies = [ "pytest>=8.4,<9", "duckdb>=1.1,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.7,<2.8", ] @@ -146,7 +146,7 @@ dependencies = [ "pytest-xdist>=3.7,<4", "pytest-timeout>=2.4,<3", "pytest-reportlog>=0.4,<0.5", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.7,<2.8", ] diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py b/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py new file mode 100644 index 0000000000..ecc9a9011a --- /dev/null +++ b/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py @@ -0,0 +1,126 @@ +import datetime + +import pyarrow as pa +import pytest +from pyiceberg.partitioning import ( + BucketTransform, + DayTransform, + HourTransform, + IdentityTransform, + PartitionField, + PartitionSpec, + TruncateTransform, + YearTransform, + MonthTransform, +) +from pyiceberg.schema import Schema +from pyiceberg.types import DateType, IntegerType, NestedField, StringType, TimestampType + +from .utils import create_sql_catalog + + +def _make_common_schema(): + return Schema( + NestedField(1, "number", IntegerType(), required=True), + NestedField(2, "letter", StringType(), required=False), + NestedField(3, "ts", TimestampType(), required=False), + NestedField(4, "dt", DateType(), required=False), + ) + + +def _append_sample_data(table): + start_date = datetime.date(2023, 3, 1) + start_dt = datetime.datetime(2023, 3, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + letters = [ + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + ] + numbers = list(range(1, 13)) + dts = [start_date + datetime.timedelta(days=i) for i in range(12)] + tss = [start_dt + datetime.timedelta(days=i) for i in range(12)] + + arrow_schema = pa.schema( + [ + pa.field("number", pa.int32(), nullable=False), + pa.field("letter", pa.string(), nullable=True), + pa.field("ts", pa.timestamp("us"), nullable=True), + pa.field("dt", pa.date32(), nullable=True), + ] + ) + + for i in 
range(0, 12, 4): + arrays = [ + pa.array(numbers[i : i + 4], type=pa.int32()), + pa.array(letters[i : i + 4], type=pa.string()), + pa.array(tss[i : i + 4], type=pa.timestamp("us")), + pa.array(dts[i : i + 4], type=pa.date32()), + ] + batch = pa.Table.from_arrays(arrays, schema=arrow_schema) + table.append(batch) + + +@pytest.mark.parametrize( + "table_name, spec, predicate", + [ + ( + "default.test_partitioned_by_identity", + PartitionSpec(PartitionField(3, 1001, IdentityTransform(), "ts")), + "ts >= '2023-03-05T00:00:00+00:00'", + ), + ( + "default.test_partitioned_by_years", + PartitionSpec(PartitionField(4, 1002, YearTransform(), "dt_year")), + "dt >= '2023-03-05'", + ), + ( + "default.test_partitioned_by_months", + PartitionSpec(PartitionField(4, 1003, MonthTransform(), "dt_month")), + "dt >= '2023-03-05'", + ), + ( + "default.test_partitioned_by_days", + PartitionSpec(PartitionField(3, 1004, DayTransform(), "ts_day")), + "ts >= '2023-03-05T00:00:00+00:00'", + ), + ( + "default.test_partitioned_by_hours", + PartitionSpec(PartitionField(3, 1005, HourTransform(), "ts_hour")), + "ts >= '2023-03-05T00:00:00+00:00'", + ), + ( + "default.test_partitioned_by_truncate", + PartitionSpec(PartitionField(2, 1006, TruncateTransform(1), "letter_trunc")), + "letter >= 'e'", + ), + ( + "default.test_partitioned_by_bucket", + PartitionSpec(PartitionField(1, 1007, BucketTransform(8), "number_bucket")), + "number >= 5", + ), + ], +) +def test_partition_transform_pruning(spark, tmp_path, table_name, spec, predicate): + catalog = create_sql_catalog(tmp_path) + schema = _make_common_schema() + table = catalog.create_table(identifier=table_name, schema=schema, partition_spec=spec) + try: + _append_sample_data(table) + path = table.location() + + df = spark.read.format("iceberg").load(path).filter(predicate).select("number") + result = set(r[0] for r in df.collect()) + assert result == {5, 6, 7, 8, 9, 10, 11, 12} + finally: + catalog.drop_table(table_name) + + diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_projection.py b/python/pysail/tests/spark/iceberg/test_iceberg_projection.py new file mode 100644 index 0000000000..119ae96d0b --- /dev/null +++ b/python/pysail/tests/spark/iceberg/test_iceberg_projection.py @@ -0,0 +1,58 @@ +import pyarrow as pa + +from pyiceberg.schema import Schema +from pyiceberg.types import IntegerType, NestedField, StringType, StructType + +from .utils import create_sql_catalog + + +def test_column_projection_subset_and_order(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + identifier = "default.test_projection_subset" + schema = Schema( + NestedField(1, "a", IntegerType(), required=False), + NestedField(2, "b", StringType(), required=False), + NestedField(3, "c", IntegerType(), required=False), + ) + table = catalog.create_table(identifier=identifier, schema=schema) + try: + table.append(pa.table({"a": pa.array([1, 2], type=pa.int32()), "b": pa.array(["x", "y"], type=pa.string()), "c": pa.array([3, 4], type=pa.int32())})) + path = table.location() + + df = spark.read.format("iceberg").load(path).select("c", "a") + rows = sorted([(r[0], r[1]) for r in df.collect()]) + assert rows == [(3, 1), (4, 2)] + finally: + catalog.drop_table(identifier) + + +def test_nested_struct_projection_and_nulls(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + identifier = "default.test_projection_nested" + inner = StructType(NestedField(10, "x", IntegerType(), required=False), NestedField(11, "y", StringType(), required=False)) + schema = Schema( + NestedField(1, 
"id", IntegerType(), required=False), + NestedField(2, "s", inner, required=False), + ) + table = catalog.create_table(identifier=identifier, schema=schema) + try: + struct_type = pa.struct([("x", pa.int32()), ("y", pa.string())]) + t1 = pa.Table.from_arrays([ + pa.array([1], type=pa.int32()), + pa.array([None], type=struct_type), + ], names=["id", "s"]) + t2 = pa.Table.from_arrays([ + pa.array([2], type=pa.int32()), + pa.array([{"x": 7, "y": "z"}], type=struct_type), + ], names=["id", "s"]) + table.append(t1) + table.append(t2) + path = table.location() + + df = spark.read.format("iceberg").load(path).select("id", "s.x") + result = sorted([(r[0], r[1]) for r in df.collect()]) + assert result == [(1, None), (2, 7)] + finally: + catalog.drop_table(identifier) + + diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_reads.py b/python/pysail/tests/spark/iceberg/test_iceberg_reads.py new file mode 100644 index 0000000000..0c6b12a8f9 --- /dev/null +++ b/python/pysail/tests/spark/iceberg/test_iceberg_reads.py @@ -0,0 +1,125 @@ +import math + +import pyarrow as pa +import pytest +from pyiceberg.schema import Schema +from pyiceberg.types import BooleanType, DoubleType, NestedField, StringType, TimestampType + +from .utils import create_sql_catalog + + +@pytest.mark.parametrize("use_rewritten", [False, True]) +def test_nan_reads(spark, tmp_path, use_rewritten): + catalog = create_sql_catalog(tmp_path) + identifier = f"default.test_nan_reads_{'rewritten' if use_rewritten else 'orig'}" + table = catalog.create_table( + identifier=identifier, + schema=Schema( + NestedField(1, "idx", DoubleType(), required=False), + NestedField(2, "col_numeric", DoubleType(), required=False), + ), + ) + try: + tbl = pa.table({"idx": [1.0, 2.0, 3.0], "col_numeric": [float("nan"), 2.0, 3.0]}) + table.append(tbl) + path = table.location() + df = spark.read.format("iceberg").load(path).select("idx", "col_numeric").filter("isnan(col_numeric)") + rows = df.collect() + assert len(rows) == 1 + assert int(rows[0][0]) == 1 + assert math.isnan(rows[0][1]) + finally: + catalog.drop_table(identifier) + + +def test_datetime_filter_reads(spark, tmp_path): + from datetime import datetime, timedelta + + catalog = create_sql_catalog(tmp_path) + identifier = "default.test_datetime_filter_reads" + table = catalog.create_table( + identifier=identifier, + schema=Schema( + NestedField(1, "str", StringType(), required=False), + NestedField(2, "datetime", TimestampType(), required=False), + ), + ) + try: + yesterday = datetime.now() - timedelta(days=1) + tbl = pa.table({"str": ["foo"], "datetime": [yesterday]}) + table.append(tbl) + path = table.location() + iso_ts = yesterday.isoformat() + df_ge = spark.read.format("iceberg").load(path).filter(f"datetime >= '{iso_ts}'") + assert df_ge.count() == 1 + df_lt = spark.read.format("iceberg").load(path).filter(f"datetime < '{iso_ts}'") + assert df_lt.count() == 0 + finally: + catalog.drop_table(identifier) + + +def test_struct_null_filters(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + identifier = "default.test_struct_null_filters" + struct_field = pa.struct([("test", pa.int32())]) + arrow_schema = pa.schema([pa.field("col_struct", struct_field)]) + table = catalog.create_table(identifier=identifier, schema=arrow_schema) + try: + t1 = pa.Table.from_arrays([pa.array([None], type=struct_field)], schema=arrow_schema) + t2 = pa.Table.from_arrays([pa.array([{"test": 1}], type=struct_field)], schema=arrow_schema) + table.append(t1) + table.append(t2) + path = table.location() + 
df_all = spark.read.format("iceberg").load(path) + assert df_all.count() == 2 + df_not_null = df_all.filter("col_struct.test is not null") + assert df_not_null.count() == 1 + df_null = df_all.filter("col_struct.test is null") + assert df_null.count() == 1 + finally: + catalog.drop_table(identifier) + + +def test_limit_with_multiple_files(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + identifier = "default.test_limit_with_multiple_files" + table = catalog.create_table( + identifier=identifier, + schema=Schema( + NestedField(1, "id", StringType(), required=False), + ), + ) + try: + tbl1 = pa.table({"id": ["a", "b", "c", "d", "e"]}) + tbl2 = pa.table({"id": ["f", "g", "h", "i", "j"]}) + table.append(tbl1) + table.append(tbl2) + path = table.location() + df = spark.read.format("iceberg").load(path).limit(3) + assert df.count() == 3 + finally: + catalog.drop_table(identifier) + + + +def test_limit_with_filter(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + identifier = "default.test_limit_with_filter" + table = catalog.create_table( + identifier=identifier, + schema=Schema( + NestedField(1, "id", StringType(), required=False), + NestedField(2, "flag", BooleanType(), required=False), + ), + ) + try: + tbl1 = pa.table({"id": ["a", "b", "c", "d", "e"], "flag": [True, False, True, True, False]}) + tbl2 = pa.table({"id": ["f", "g", "h", "i", "j"], "flag": [False, True, False, True, True]}) + table.append(tbl1) + table.append(tbl2) + path = table.location() + df = spark.read.format("iceberg").load(path).filter("flag = true").limit(3) + assert df.count() == 3 + finally: + catalog.drop_table(identifier) + diff --git a/python/pysail/tests/spark/iceberg/utils.py b/python/pysail/tests/spark/iceberg/utils.py new file mode 100644 index 0000000000..28903993bc --- /dev/null +++ b/python/pysail/tests/spark/iceberg/utils.py @@ -0,0 +1,21 @@ +from pathlib import Path + +from pyiceberg.catalog import load_catalog + + +def create_sql_catalog(tmp_path: Path): + warehouse_path = tmp_path / "warehouse" + warehouse_path.mkdir(parents=True, exist_ok=True) + catalog = load_catalog( + "test_catalog", + type="sql", + uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", + warehouse=f"file://{warehouse_path}", + ) + try: + catalog.create_namespace("default") + except Exception: + pass + return catalog + + From 8550f8188f604f0a6d7cd9e3d01905ffb3cf7a37 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Wed, 15 Oct 2025 17:20:42 +0800 Subject: [PATCH 27/32] fmt --- .../iceberg/test_iceberg_partitioned_reads.py | 10 ++--- .../spark/iceberg/test_iceberg_projection.py | 41 ++++++++++++------- .../tests/spark/iceberg/test_iceberg_reads.py | 12 +++--- python/pysail/tests/spark/iceberg/utils.py | 6 +-- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py b/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py index ecc9a9011a..da7ef36740 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py @@ -7,16 +7,16 @@ DayTransform, HourTransform, IdentityTransform, + MonthTransform, PartitionField, PartitionSpec, TruncateTransform, YearTransform, - MonthTransform, ) from pyiceberg.schema import Schema from pyiceberg.types import DateType, IntegerType, NestedField, StringType, TimestampType -from .utils import create_sql_catalog +from .utils import create_sql_catalog # noqa: TID252 def _make_common_schema(): @@ -70,7 +70,7 @@ def 
_append_sample_data(table): @pytest.mark.parametrize( - "table_name, spec, predicate", + "table_name, spec, predicate", # noqa: PT006 [ ( "default.test_partitioned_by_identity", @@ -118,9 +118,7 @@ def test_partition_transform_pruning(spark, tmp_path, table_name, spec, predicat path = table.location() df = spark.read.format("iceberg").load(path).filter(predicate).select("number") - result = set(r[0] for r in df.collect()) + result = set(r[0] for r in df.collect()) # noqa: C401 assert result == {5, 6, 7, 8, 9, 10, 11, 12} finally: catalog.drop_table(table_name) - - diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_projection.py b/python/pysail/tests/spark/iceberg/test_iceberg_projection.py index 119ae96d0b..6d5047ae9a 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_projection.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_projection.py @@ -1,9 +1,8 @@ import pyarrow as pa - from pyiceberg.schema import Schema from pyiceberg.types import IntegerType, NestedField, StringType, StructType -from .utils import create_sql_catalog +from .utils import create_sql_catalog # noqa: TID252 def test_column_projection_subset_and_order(spark, tmp_path): @@ -16,7 +15,15 @@ def test_column_projection_subset_and_order(spark, tmp_path): ) table = catalog.create_table(identifier=identifier, schema=schema) try: - table.append(pa.table({"a": pa.array([1, 2], type=pa.int32()), "b": pa.array(["x", "y"], type=pa.string()), "c": pa.array([3, 4], type=pa.int32())})) + table.append( + pa.table( + { + "a": pa.array([1, 2], type=pa.int32()), + "b": pa.array(["x", "y"], type=pa.string()), + "c": pa.array([3, 4], type=pa.int32()), + } + ) + ) path = table.location() df = spark.read.format("iceberg").load(path).select("c", "a") @@ -29,7 +36,9 @@ def test_column_projection_subset_and_order(spark, tmp_path): def test_nested_struct_projection_and_nulls(spark, tmp_path): catalog = create_sql_catalog(tmp_path) identifier = "default.test_projection_nested" - inner = StructType(NestedField(10, "x", IntegerType(), required=False), NestedField(11, "y", StringType(), required=False)) + inner = StructType( + NestedField(10, "x", IntegerType(), required=False), NestedField(11, "y", StringType(), required=False) + ) schema = Schema( NestedField(1, "id", IntegerType(), required=False), NestedField(2, "s", inner, required=False), @@ -37,14 +46,20 @@ def test_nested_struct_projection_and_nulls(spark, tmp_path): table = catalog.create_table(identifier=identifier, schema=schema) try: struct_type = pa.struct([("x", pa.int32()), ("y", pa.string())]) - t1 = pa.Table.from_arrays([ - pa.array([1], type=pa.int32()), - pa.array([None], type=struct_type), - ], names=["id", "s"]) - t2 = pa.Table.from_arrays([ - pa.array([2], type=pa.int32()), - pa.array([{"x": 7, "y": "z"}], type=struct_type), - ], names=["id", "s"]) + t1 = pa.Table.from_arrays( + [ + pa.array([1], type=pa.int32()), + pa.array([None], type=struct_type), + ], + names=["id", "s"], + ) + t2 = pa.Table.from_arrays( + [ + pa.array([2], type=pa.int32()), + pa.array([{"x": 7, "y": "z"}], type=struct_type), + ], + names=["id", "s"], + ) table.append(t1) table.append(t2) path = table.location() @@ -54,5 +69,3 @@ def test_nested_struct_projection_and_nulls(spark, tmp_path): assert result == [(1, None), (2, 7)] finally: catalog.drop_table(identifier) - - diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_reads.py b/python/pysail/tests/spark/iceberg/test_iceberg_reads.py index 0c6b12a8f9..ec00858c37 100644 --- 
a/python/pysail/tests/spark/iceberg/test_iceberg_reads.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_reads.py @@ -5,7 +5,7 @@ from pyiceberg.schema import Schema from pyiceberg.types import BooleanType, DoubleType, NestedField, StringType, TimestampType -from .utils import create_sql_catalog +from .utils import create_sql_catalog # noqa: TID252 @pytest.mark.parametrize("use_rewritten", [False, True]) @@ -45,7 +45,7 @@ def test_datetime_filter_reads(spark, tmp_path): ), ) try: - yesterday = datetime.now() - timedelta(days=1) + yesterday = datetime.now() - timedelta(days=1) # noqa: DTZ005 tbl = pa.table({"str": ["foo"], "datetime": [yesterday]}) table.append(tbl) path = table.location() @@ -71,7 +71,7 @@ def test_struct_null_filters(spark, tmp_path): table.append(t2) path = table.location() df_all = spark.read.format("iceberg").load(path) - assert df_all.count() == 2 + assert df_all.count() == 2 # noqa: PLR2004 df_not_null = df_all.filter("col_struct.test is not null") assert df_not_null.count() == 1 df_null = df_all.filter("col_struct.test is null") @@ -96,12 +96,11 @@ def test_limit_with_multiple_files(spark, tmp_path): table.append(tbl2) path = table.location() df = spark.read.format("iceberg").load(path).limit(3) - assert df.count() == 3 + assert df.count() == 3 # noqa: PLR2004 finally: catalog.drop_table(identifier) - def test_limit_with_filter(spark, tmp_path): catalog = create_sql_catalog(tmp_path) identifier = "default.test_limit_with_filter" @@ -119,7 +118,6 @@ def test_limit_with_filter(spark, tmp_path): table.append(tbl2) path = table.location() df = spark.read.format("iceberg").load(path).filter("flag = true").limit(3) - assert df.count() == 3 + assert df.count() == 3 # noqa: PLR2004 finally: catalog.drop_table(identifier) - diff --git a/python/pysail/tests/spark/iceberg/utils.py b/python/pysail/tests/spark/iceberg/utils.py index 28903993bc..6713cc446f 100644 --- a/python/pysail/tests/spark/iceberg/utils.py +++ b/python/pysail/tests/spark/iceberg/utils.py @@ -12,10 +12,8 @@ def create_sql_catalog(tmp_path: Path): uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", warehouse=f"file://{warehouse_path}", ) - try: + try: # noqa: SIM105 catalog.create_namespace("default") - except Exception: + except Exception: # noqa: S110, BLE001 pass return catalog - - From 2ee47cddb3422c075f0cfea2fd1ac878119872d0 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 16 Oct 2025 15:02:54 +0800 Subject: [PATCH 28/32] update --- .../sail-iceberg/src/datasource/provider.rs | 3 +- crates/sail-iceberg/src/datasource/pruning.rs | 35 ++------- .../sail-iceberg/src/spec/manifest/writer.rs | 3 +- python/pysail/tests/spark/iceberg/conftest.py | 27 +++++++ .../tests/spark/iceberg/test_iceberg_io.py | 70 +++++------------- .../iceberg/test_iceberg_partitioned_reads.py | 60 ++++++++++----- .../spark/iceberg/test_iceberg_pruning.py | 73 +++++++++---------- 7 files changed, 132 insertions(+), 139 deletions(-) create mode 100644 python/pysail/tests/spark/iceberg/conftest.py diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index a648c178d2..59e1655e67 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -161,7 +161,6 @@ impl IcebergTableProvider { ) -> DataFusionResult> { let mut data_files = Vec::new(); - // Build partition spec map for summary pruning let spec_map: HashMap = self .partition_specs .iter() @@ -171,7 +170,7 @@ impl IcebergTableProvider { 
prune_manifests_by_partition_summaries(manifest_list, &self.schema, &spec_map, filters); for manifest_file in manifest_files { - // TODO: Support delete manifests + // TODO: Add support for delete manifests if manifest_file.content != ManifestContentType::Data { continue; } diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index d77aa9c33f..dc244b6d49 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -15,12 +15,12 @@ use crate::spec::partition::PartitionSpec; use crate::spec::types::values::{Datum, Literal}; use crate::spec::types::{PrimitiveType, Type}; use crate::spec::{DataFile, Manifest, ManifestContentType, ManifestList, Schema}; -// TODO: Consider parsing logical expressions more robustly for summary pruning +// TODO: Implement robust logical expression parsing for summary pruning pub(crate) fn literal_to_scalar_value_local( literal: &Literal, ) -> datafusion::common::scalar::ScalarValue { - // TODO: Extend conversion to cover Decimal/UUID/Fixed with precise semantics and timezones + // TODO: Add Decimal/UUID/Fixed conversion with precise semantics and timezones match literal { Literal::Primitive(p) => match p { crate::spec::types::values::PrimitiveLiteral::Boolean(v) => { @@ -121,10 +121,8 @@ impl IcebergPruningStats { datum: &Datum, ) -> datafusion::common::scalar::ScalarValue { use datafusion::common::scalar::ScalarValue as SV; - // Convert according to the Iceberg field primitive type to match Arrow schema match self.field_id_to_type.get(&field_id) { Some(PrimitiveType::Date) => { - // Iceberg encodes date as days (i32) if let crate::spec::types::values::PrimitiveLiteral::Int(v) = datum.literal { SV::Date32(Some(v)) } else { @@ -132,7 +130,6 @@ impl IcebergPruningStats { } } Some(PrimitiveType::Timestamp | PrimitiveType::TimestampNs) => { - // Iceberg encodes timestamp without tz as microseconds (long) if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { SV::TimestampMicrosecond(Some(v), None) } else { @@ -141,7 +138,6 @@ impl IcebergPruningStats { } Some(PrimitiveType::Timestamptz | PrimitiveType::TimestamptzNs) => { if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { - // Use UTC; Arrow keeps tz as string label SV::TimestampMicrosecond(Some(v), Some(std::sync::Arc::from("UTC"))) } else { literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) @@ -149,7 +145,6 @@ impl IcebergPruningStats { } Some(PrimitiveType::Time) => { if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { - // Iceberg time is microseconds from midnight SV::Time64Microsecond(Some(v)) } else { literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) @@ -230,8 +225,6 @@ impl PruningStatistics for IcebergPruningStats { _column: &Column, _value: &std::collections::HashSet, ) -> Option { - // Basic contained() for equality/IN pruning using lower/upper bounds equality for strings and integers - // When both bounds are equal to the value, we can mark "contained" true; otherwise, unknown let field_id = self.field_id_for(_column)?; let mut result = Vec::with_capacity(self.files.len()); for f in &self.files { @@ -312,7 +305,7 @@ pub fn prune_manifests_by_partition_summaries<'a>( partition_specs: &std::collections::HashMap, filters: &[Expr], ) -> Vec<&'a crate::spec::manifest_list::ManifestFile> { - // TODO: Support non-identity transforms (day/month/hour/bucket/truncate) + // TODO: Add 
support for non-identity transforms (day/month/hour/bucket/truncate) let eq_filters = collect_identity_eq_filters(table_schema, filters); let in_filters = collect_identity_in_filters(table_schema, filters); let range_filters = collect_identity_range_filters(table_schema, filters); @@ -321,7 +314,6 @@ pub fn prune_manifests_by_partition_summaries<'a>( .iter() .filter(|mf| mf.content == ManifestContentType::Data) .filter(|mf| { - // If no simple identity eq filters, keep manifest if eq_filters.is_empty() { return true; } @@ -331,20 +323,16 @@ pub fn prune_manifests_by_partition_summaries<'a>( let Some(part_summaries) = mf.partitions.as_ref() else { return true; }; - // Build partition field result types let part_type = match spec.partition_type(table_schema) { Ok(t) => t, Err(_) => return true, }; - // Evaluate equality filters; if any contradicts summaries, drop manifest for (source_id, lit) in &eq_filters { - // find partition field with identity transform sourcing this column if let Some((idx, _pf)) = spec.fields().iter().enumerate().find(|(_, pf)| { pf.source_id == *source_id && matches!(pf.transform, crate::spec::transform::Transform::Identity) }) { if let Some(summary) = part_summaries.get(idx) { - // decode bounds according to partition field type let field_ty = part_type.fields().get(idx).map(|nf| nf.field_type.as_ref()); if let Some(Type::Primitive(prim_ty)) = field_ty { // TODO: Handle contains_null/contains_nan from FieldSummary @@ -358,17 +346,14 @@ pub fn prune_manifests_by_partition_summaries<'a>( .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); if let (Some(lb), Some(ub)) = (lower.as_ref(), upper.as_ref()) { - // if lit < lb or lit > ub, cannot match if lt_prim(lit, lb) || gt_prim(lit, ub) { return false; } } else if let Some(lb) = lower.as_ref() { - // if we have only a lower bound, drop manifest if lit < lb if lt_prim(lit, lb) { return false; } } else if let Some(ub) = upper.as_ref() { - // if we have only an upper bound, drop manifest if lit > ub if gt_prim(lit, ub) { return false; } @@ -378,7 +363,6 @@ pub fn prune_manifests_by_partition_summaries<'a>( } } - // Evaluate IN-list filters: require intersection with [lb, ub] for (source_id, lits) in &in_filters { if let Some((idx, _pf)) = spec.fields().iter().enumerate().find(|(_, pf)| { pf.source_id == *source_id @@ -407,13 +391,11 @@ pub fn prune_manifests_by_partition_summaries<'a>( return false; } } else if let Some(lb) = lower.as_ref() { - // with only lower bound, require any value >= lb let any_in = lits.iter().any(|v| !lt_prim(v, lb)); if !any_in { return false; } } else if let Some(ub) = upper.as_ref() { - // with only upper bound, require any value <= ub let any_in = lits.iter().any(|v| !gt_prim(v, ub)); if !any_in { return false; @@ -424,7 +406,6 @@ pub fn prune_manifests_by_partition_summaries<'a>( } } - // Evaluate simple range filters: require overlap with [lb, ub] for (source_id, range) in &range_filters { if let Some((idx, _pf)) = spec.fields().iter().enumerate().find(|(_, pf)| { pf.source_id == *source_id @@ -442,8 +423,6 @@ pub fn prune_manifests_by_partition_summaries<'a>( .as_ref() .and_then(|b| decode_bound_bytes(prim_ty, b).ok()); if let (Some(lb), Some(ub)) = (lower.as_ref(), upper.as_ref()) { - // manifest range [lb, ub] - // query range [min, max] if let Some((ref qmin, _incl_min)) = range.min { if gt_prim(qmin, ub) { return false; @@ -470,7 +449,7 @@ pub fn prune_manifest_entries( _schema: &Schema, _filters: &[Expr], ) -> Vec { - // TODO: Partition-transform awareness and metrics-only prune at 
manifest entry granularity + // TODO: Add partition-transform awareness and metrics-only pruning at manifest entry granularity manifest .entries() .iter() @@ -489,7 +468,6 @@ fn collect_identity_eq_filters( schema: &Schema, filters: &[Expr], ) -> Vec<(i32, crate::spec::types::values::PrimitiveLiteral)> { - // returns Vec of (source_id, literal) for Exprs of form col = literal fn strip(expr: &Expr) -> &Expr { match expr { Expr::Cast(c) => strip(&c.expr), @@ -764,10 +742,7 @@ fn decode_bound_bytes( PrimitiveType::Uuid => { return Err("uuid bound decoding not supported".to_string()); } - PrimitiveType::Fixed(_) | PrimitiveType::Binary => { - // Treat bounds as raw bytes for conservative comparisons (equality filters only) - PL::Binary(bytes.to_vec()) - } + PrimitiveType::Fixed(_) | PrimitiveType::Binary => PL::Binary(bytes.to_vec()), PrimitiveType::Decimal { .. } => { return Err("decimal bound decoding not supported".to_string()); } diff --git a/crates/sail-iceberg/src/spec/manifest/writer.rs b/crates/sail-iceberg/src/spec/manifest/writer.rs index cabca8869b..00e99901ae 100644 --- a/crates/sail-iceberg/src/spec/manifest/writer.rs +++ b/crates/sail-iceberg/src/spec/manifest/writer.rs @@ -17,8 +17,7 @@ // [CREDIT]: https://raw.githubusercontent.com/apache/iceberg-rust/dc349284a4204c1a56af47fb3177ace6f9e899a0/crates/iceberg/src/spec/manifest/writer.rs -// Awareness stub for non-read path -// TODO: Implement writer support if/when write path is added. +// TODO: Implement manifest writer #[allow(dead_code)] #[derive(Debug, Clone)] pub struct ManifestWriter; diff --git a/python/pysail/tests/spark/iceberg/conftest.py b/python/pysail/tests/spark/iceberg/conftest.py new file mode 100644 index 0000000000..396117cbe5 --- /dev/null +++ b/python/pysail/tests/spark/iceberg/conftest.py @@ -0,0 +1,27 @@ +from typing import TYPE_CHECKING + +import pytest +from pyiceberg.table import Table + +from pysail.tests.spark.iceberg.utils import create_sql_catalog + +if TYPE_CHECKING: + from pyiceberg.schema import Schema + + +@pytest.fixture +def sql_catalog(tmp_path): + return create_sql_catalog(tmp_path) + + +@pytest.fixture +def iceberg_table(sql_catalog, request) -> Table: + params = request.param + schema: Schema = params["schema"] + identifier: str = params["identifier"] + partition_spec = params.get("partition_spec") + table = sql_catalog.create_table(identifier=identifier, schema=schema, partition_spec=partition_spec) + try: + yield table + finally: + sql_catalog.drop_table(identifier) diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_io.py b/python/pysail/tests/spark/iceberg/test_iceberg_io.py index 0df234e9bc..9a32254bb7 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_io.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_io.py @@ -2,7 +2,6 @@ import pyarrow as pa import pytest from pandas.testing import assert_frame_equal -from pyiceberg.catalog import load_catalog from pyiceberg.schema import Schema from pyiceberg.types import DoubleType, LongType, NestedField, StringType @@ -18,32 +17,23 @@ def iceberg_test_data(): @pytest.fixture def expected_pandas_df(): - return pd.DataFrame({"id": [10, 11, 12], "event": ["A", "B", "A"], "score": [0.98, 0.54, 0.76]}).astype( - {"id": "int64", "event": "string", "score": "float64"} + return ( + pd.DataFrame({"id": [10, 11, 12], "event": ["A", "B", "A"], "score": [0.98, 0.54, 0.76]}) + .astype({"id": "int64", "score": "float64"}) + .assign(event=lambda df: df["event"].astype("object")) ) -def test_iceberg_io_basic_read(spark, iceberg_test_data, 
expected_pandas_df, tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() +def test_iceberg_io_basic_read(spark, iceberg_test_data, expected_pandas_df, sql_catalog): table_name = "test_table" - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) - - catalog.create_namespace("default") - schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="event", field_type=StringType(), required=False), NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), ) - table = catalog.create_table( + table = sql_catalog.create_table( identifier=f"default.{table_name}", schema=schema, ) @@ -58,33 +48,22 @@ def test_iceberg_io_basic_read(spark, iceberg_test_data, expected_pandas_df, tmp result_df = spark.read.format("iceberg").load(table_path).sort("id") assert_frame_equal( - result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False + result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=True ) finally: - catalog.drop_table(f"default.{table_name}") + sql_catalog.drop_table(f"default.{table_name}") -def test_iceberg_io_read_with_sql(spark, iceberg_test_data, expected_pandas_df, tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() +def test_iceberg_io_read_with_sql(spark, iceberg_test_data, expected_pandas_df, sql_catalog): table_name = "test_table_sql" - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) - - catalog.create_namespace("default") - schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="event", field_type=StringType(), required=False), NestedField(field_id=3, name="score", field_type=DoubleType(), required=False), ) - table = catalog.create_table( + table = sql_catalog.create_table( identifier=f"default.{table_name}", schema=schema, ) @@ -102,34 +81,23 @@ def test_iceberg_io_read_with_sql(spark, iceberg_test_data, expected_pandas_df, result_df = spark.sql("SELECT * FROM my_iceberg").sort("id") assert_frame_equal( - result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=False + result_df.toPandas(), expected_pandas_df.sort_values("id").reset_index(drop=True), check_dtype=True ) finally: spark.sql("DROP TABLE IF EXISTS my_iceberg") finally: - catalog.drop_table(f"default.{table_name}") + sql_catalog.drop_table(f"default.{table_name}") -def test_iceberg_io_multiple_files(spark, tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() +def test_iceberg_io_multiple_files(spark, sql_catalog): table_name = "test_table_multiple" - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) - - catalog.create_namespace("default") - schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="value", field_type=StringType(), required=False), ) - table = catalog.create_table( + table = sql_catalog.create_table( identifier=f"default.{table_name}", schema=schema, ) @@ -147,14 +115,16 @@ def test_iceberg_io_multiple_files(spark, tmp_path): result_df = spark.read.format("iceberg").load(table_path).sort("id") - 
expected_data = pd.DataFrame({"id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]}).astype( - {"id": "int64", "value": "string"} + expected_data = ( + pd.DataFrame({"id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]}) + .astype({"id": "int64"}) + .assign(value=lambda df: df["value"].astype("object")) ) assert_frame_equal( - result_df.toPandas(), expected_data.sort_values("id").reset_index(drop=True), check_dtype=False + result_df.toPandas(), expected_data.sort_values("id").reset_index(drop=True), check_dtype=True ) assert result_df.count() == 4 # noqa: PLR2004 finally: - catalog.drop_table(f"default.{table_name}") + sql_catalog.drop_table(f"default.{table_name}") diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py b/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py index da7ef36740..911ff6bca0 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_partitioned_reads.py @@ -70,46 +70,69 @@ def _append_sample_data(table): @pytest.mark.parametrize( - "table_name, spec, predicate", # noqa: PT006 + ("table_name", "spec", "predicate_column", "predicate_value", "expected_numbers"), [ - ( + pytest.param( "default.test_partitioned_by_identity", PartitionSpec(PartitionField(3, 1001, IdentityTransform(), "ts")), - "ts >= '2023-03-05T00:00:00+00:00'", + "ts", + "'2023-03-05T00:00:00+00:00'", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-identity", ), - ( + pytest.param( "default.test_partitioned_by_years", PartitionSpec(PartitionField(4, 1002, YearTransform(), "dt_year")), - "dt >= '2023-03-05'", + "dt", + "'2023-03-05'", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-years", ), - ( + pytest.param( "default.test_partitioned_by_months", PartitionSpec(PartitionField(4, 1003, MonthTransform(), "dt_month")), - "dt >= '2023-03-05'", + "dt", + "'2023-03-05'", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-months", ), - ( + pytest.param( "default.test_partitioned_by_days", PartitionSpec(PartitionField(3, 1004, DayTransform(), "ts_day")), - "ts >= '2023-03-05T00:00:00+00:00'", + "ts", + "'2023-03-05T00:00:00+00:00'", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-days", ), - ( + pytest.param( "default.test_partitioned_by_hours", PartitionSpec(PartitionField(3, 1005, HourTransform(), "ts_hour")), - "ts >= '2023-03-05T00:00:00+00:00'", + "ts", + "'2023-03-05T00:00:00+00:00'", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-hours", ), - ( + pytest.param( "default.test_partitioned_by_truncate", PartitionSpec(PartitionField(2, 1006, TruncateTransform(1), "letter_trunc")), - "letter >= 'e'", + "letter", + "'e'", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-truncate", ), - ( + pytest.param( "default.test_partitioned_by_bucket", PartitionSpec(PartitionField(1, 1007, BucketTransform(8), "number_bucket")), - "number >= 5", + "number", + "5", + {5, 6, 7, 8, 9, 10, 11, 12}, + id="partition-by-bucket", ), ], ) -def test_partition_transform_pruning(spark, tmp_path, table_name, spec, predicate): +def test_partition_transform_pruning( + spark, tmp_path, table_name, spec, predicate_column, predicate_value, expected_numbers +): catalog = create_sql_catalog(tmp_path) schema = _make_common_schema() table = catalog.create_table(identifier=table_name, schema=schema, partition_spec=spec) @@ -117,8 +140,9 @@ def test_partition_transform_pruning(spark, tmp_path, table_name, spec, predicat _append_sample_data(table) path = table.location() + predicate = f"{predicate_column} >= {predicate_value}" 
df = spark.read.format("iceberg").load(path).filter(predicate).select("number") - result = set(r[0] for r in df.collect()) # noqa: C401 - assert result == {5, 6, 7, 8, 9, 10, 11, 12} + result = {r[0] for r in df.collect()} + assert result == expected_numbers finally: catalog.drop_table(table_name) diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py index e86ba89c33..cf1ed5b0e4 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_pruning.py @@ -1,28 +1,14 @@ # ruff: noqa import pandas as pd import pyarrow as pa -from pyiceberg.catalog import load_catalog from pyiceberg.schema import Schema from pyiceberg.types import BooleanType, DoubleType, IntegerType, LongType, NestedField, StringType +from .utils import create_sql_catalog -def _create_catalog(tmp_path): - warehouse_path = tmp_path / "warehouse" - warehouse_path.mkdir() - catalog = load_catalog( - "test_catalog", - type="sql", - uri=f"sqlite:///{tmp_path}/pyiceberg_catalog.db", - warehouse=f"file://{warehouse_path}", - ) - catalog.create_namespace("default") - return catalog - - -def test_equality_and_in(spark, tmp_path): - catalog = _create_catalog(tmp_path) +def _make_eq_in_table(catalog, ident): table = catalog.create_table( - identifier="default.prune_eq_in", + identifier=ident, schema=Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="year", field_type=IntegerType(), required=False), @@ -30,33 +16,46 @@ def test_equality_and_in(spark, tmp_path): NestedField(field_id=4, name="value", field_type=StringType(), required=False), ), ) + batches = [ + pd.DataFrame({"id": [1, 2], "year": [2023, 2023], "month": [1, 1], "value": ["a", "b"]}), + pd.DataFrame({"id": [3, 4], "year": [2023, 2023], "month": [2, 2], "value": ["c", "d"]}), + pd.DataFrame({"id": [5, 6], "year": [2024, 2024], "month": [1, 1], "value": ["e", "f"]}), + pd.DataFrame({"id": [7, 8], "year": [2024, 2024], "month": [2, 2], "value": ["g", "h"]}), + ] + for df in batches: + df = df.astype({"id": "int64", "year": "int32", "month": "int32"}) + table.append(pa.Table.from_pandas(df)) + return table + + +def test_pruning_equality_filters(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + ident = "default.prune_eq_only" + table = _make_eq_in_table(catalog, ident) try: - batches = [ - pd.DataFrame({"id": [1, 2], "year": [2023, 2023], "month": [1, 1], "value": ["a", "b"]}), - pd.DataFrame({"id": [3, 4], "year": [2023, 2023], "month": [2, 2], "value": ["c", "d"]}), - pd.DataFrame({"id": [5, 6], "year": [2024, 2024], "month": [1, 1], "value": ["e", "f"]}), - pd.DataFrame({"id": [7, 8], "year": [2024, 2024], "month": [2, 2], "value": ["g", "h"]}), - ] - for df in batches: - df = df.astype({"id": "int64", "year": "int32", "month": "int32"}) - table.append(pa.Table.from_pandas(df)) - tp = table.location() - df = spark.read.format("iceberg").load(tp).filter("year = 2023") assert df.count() == 4 - df = spark.read.format("iceberg").load(tp).filter("year = 2023 AND month = 1") assert df.count() == 2 + finally: + catalog.drop_table(ident) + +def test_pruning_in_clause(spark, tmp_path): + catalog = create_sql_catalog(tmp_path) + ident = "default.prune_in_only" + table = _make_eq_in_table(catalog, ident) + try: + tp = table.location() df = spark.read.format("iceberg").load(tp).filter("month IN (2)") assert df.count() == 4 finally: - catalog.drop_table("default.prune_eq_in") + 
catalog.drop_table(ident) def test_comparison_and_between(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_cmp", schema=Schema( @@ -89,7 +88,7 @@ def test_comparison_and_between(spark, tmp_path): def test_null_and_boolean(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_null_bool", schema=Schema( @@ -122,7 +121,7 @@ def test_null_and_boolean(spark, tmp_path): def test_correctness_small(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_correct", schema=Schema( @@ -159,7 +158,7 @@ def test_correctness_small(spark, tmp_path): def test_or_and_not_pruning(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_or_and_not", schema=Schema( @@ -193,7 +192,7 @@ def test_or_and_not_pruning(spark, tmp_path): def test_string_in_and_range_pruning(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_string_in_range", schema=Schema( @@ -223,7 +222,7 @@ def test_string_in_and_range_pruning(spark, tmp_path): def test_metrics_based_pruning_numeric(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_metrics_numeric", schema=Schema( @@ -251,7 +250,7 @@ def test_metrics_based_pruning_numeric(spark, tmp_path): def test_limit_pushdown_behavior(spark, tmp_path): - catalog = _create_catalog(tmp_path) + catalog = create_sql_catalog(tmp_path) table = catalog.create_table( identifier="default.prune_limit", schema=Schema( From 64de0d1b876fc7423cc69550c10121d5374b6c55 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 16 Oct 2025 16:20:20 +0800 Subject: [PATCH 29/32] remove pyiceberg-core --- pyproject.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d97f831b3..1c19821386 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ test = [ "duckdb>=1.0,<2", "pytest>=8.4,<9", "pillow>=10.3.0", - "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", + "pyiceberg[sql-sqlite]==0.10.0", "pydantic>=2.7,<2.8", ] mcp = [ @@ -78,8 +78,8 @@ dependencies = [ "mcp>=1.0,<2", "boto3>=1.38,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", - "pydantic>=2.7,<2.8", + "pyiceberg[sql-sqlite]==0.10.0", + "pydantic>=2.7,<2.8", ] path = ".venvs/default" @@ -118,7 +118,7 @@ dependencies = [ "pytest>=8.4,<9", "duckdb>=1.1,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", + "pyiceberg[sql-sqlite]==0.10.0", "pydantic>=2.7,<2.8", ] @@ -146,7 +146,7 @@ dependencies = [ "pytest-xdist>=3.7,<4", "pytest-timeout>=2.4,<3", "pytest-reportlog>=0.4,<0.5", - "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", + "pyiceberg[sql-sqlite]==0.10.0", "pydantic>=2.7,<2.8", ] From a0bab5c46410e49fb3b6bb4760c143795ae40c7e Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 16 Oct 2025 16:23:21 +0800 Subject: [PATCH 30/32] pin pydantic --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1c19821386..7bb6a95982 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ test 
= [ "pytest>=8.4,<9", "pillow>=10.3.0", "pyiceberg[sql-sqlite]==0.10.0", - "pydantic>=2.7,<2.8", + "pydantic>=2.11,<2.12", ] mcp = [ "mcp>=1.0.0,<2", @@ -79,7 +79,7 @@ dependencies = [ "boto3>=1.38,<2", "pillow>=10.3.0", "pyiceberg[sql-sqlite]==0.10.0", - "pydantic>=2.7,<2.8", + "pydantic>=2.11,<2.12", ] path = ".venvs/default" @@ -119,7 +119,7 @@ dependencies = [ "duckdb>=1.1,<2", "pillow>=10.3.0", "pyiceberg[sql-sqlite]==0.10.0", - "pydantic>=2.7,<2.8", + "pydantic>=2.11,<2.12", ] [[tool.hatch.envs.test.matrix]] @@ -147,7 +147,7 @@ dependencies = [ "pytest-timeout>=2.4,<3", "pytest-reportlog>=0.4,<0.5", "pyiceberg[sql-sqlite]==0.10.0", - "pydantic>=2.7,<2.8", + "pydantic>=2.11,<2.12", ] [[tool.hatch.envs.test-spark.matrix]] From 49755f2750bf31df0794068500b5a3930c543e01 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 16 Oct 2025 16:34:41 +0800 Subject: [PATCH 31/32] add iceberg-core --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7bb6a95982..4b8972d2fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ test = [ "duckdb>=1.0,<2", "pytest>=8.4,<9", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.11,<2.12", ] mcp = [ @@ -78,7 +78,7 @@ dependencies = [ "mcp>=1.0,<2", "boto3>=1.38,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.11,<2.12", ] path = ".venvs/default" @@ -118,7 +118,7 @@ dependencies = [ "pytest>=8.4,<9", "duckdb>=1.1,<2", "pillow>=10.3.0", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.11,<2.12", ] @@ -146,7 +146,7 @@ dependencies = [ "pytest-xdist>=3.7,<4", "pytest-timeout>=2.4,<3", "pytest-reportlog>=0.4,<0.5", - "pyiceberg[sql-sqlite]==0.10.0", + "pyiceberg[sql-sqlite,pyiceberg-core]==0.10.0", "pydantic>=2.11,<2.12", ] From 2e63cf97141accafb7ae447167755e2c80b0e833 Mon Sep 17 00:00:00 2001 From: xiaolong Date: Thu, 16 Oct 2025 21:40:34 +0800 Subject: [PATCH 32/32] address comments --- crates/sail-iceberg/src/datasource/mod.rs | 37 +++++++++++ .../sail-iceberg/src/datasource/provider.rs | 48 ++------------ crates/sail-iceberg/src/datasource/pruning.rs | 64 ++----------------- .../tests/spark/iceberg/test_iceberg_io.py | 4 +- .../tests/spark/iceberg/test_iceberg_reads.py | 6 +- 5 files changed, 55 insertions(+), 104 deletions(-) diff --git a/crates/sail-iceberg/src/datasource/mod.rs b/crates/sail-iceberg/src/datasource/mod.rs index ce0fd04194..4508e8a8d2 100644 --- a/crates/sail-iceberg/src/datasource/mod.rs +++ b/crates/sail-iceberg/src/datasource/mod.rs @@ -3,4 +3,41 @@ pub mod expressions; pub mod provider; pub mod pruning; +use datafusion::common::scalar::ScalarValue; pub use provider::*; + +use crate::spec::types::values::{Literal, PrimitiveLiteral}; + +pub(crate) fn literal_to_scalar_value(literal: &Literal) -> ScalarValue { + match literal { + Literal::Primitive(primitive) => match primitive { + PrimitiveLiteral::Boolean(v) => ScalarValue::Boolean(Some(*v)), + PrimitiveLiteral::Int(v) => ScalarValue::Int32(Some(*v)), + PrimitiveLiteral::Long(v) => ScalarValue::Int64(Some(*v)), + PrimitiveLiteral::Float(v) => ScalarValue::Float32(Some(v.into_inner())), + PrimitiveLiteral::Double(v) => ScalarValue::Float64(Some(v.into_inner())), + PrimitiveLiteral::String(v) => ScalarValue::Utf8(Some(v.clone())), + PrimitiveLiteral::Binary(v) => ScalarValue::Binary(Some(v.clone())), + 
PrimitiveLiteral::Int128(v) => ScalarValue::Decimal128(Some(*v), 38, 0), + PrimitiveLiteral::UInt128(v) => { + if *v <= i128::MAX as u128 { + ScalarValue::Decimal128(Some(*v as i128), 38, 0) + } else { + ScalarValue::Utf8(Some(v.to_string())) + } + } + }, + Literal::Struct(fields) => { + let json_repr = serde_json::to_string(fields).unwrap_or_default(); + ScalarValue::Utf8(Some(json_repr)) + } + Literal::List(items) => { + let json_repr = serde_json::to_string(items).unwrap_or_default(); + ScalarValue::Utf8(Some(json_repr)) + } + Literal::Map(pairs) => { + let json_repr = serde_json::to_string(pairs).unwrap_or_default(); + ScalarValue::Utf8(Some(json_repr)) + } + } +} diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 59e1655e67..d705cab0c7 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -29,9 +29,10 @@ use url::Url; use crate::arrow_conversion::iceberg_schema_to_arrow; use crate::datasource::expr_adapter::IcebergPhysicalExprAdapterFactory; use crate::datasource::expressions::simplify_expr; +use crate::datasource::literal_to_scalar_value; use crate::datasource::pruning::{prune_files, prune_manifests_by_partition_summaries}; use crate::spec::manifest::DataContentType; -use crate::spec::types::values::{Literal, PrimitiveLiteral}; +use crate::spec::types::values::Literal; use crate::spec::{ DataFile, FormatVersion, Manifest, ManifestContentType, ManifestList, ManifestStatus, PartitionSpec, Schema, Snapshot, @@ -370,7 +371,7 @@ impl IcebergTableProvider { .partition() .iter() .map(|literal_opt| match literal_opt { - Some(literal) => self.literal_to_scalar_value(literal), + Some(literal) => literal_to_scalar_value(literal), None => ScalarValue::Null, }) .collect(); @@ -442,7 +443,7 @@ impl IcebergTableProvider { // min if let Some(d) = df.lower_bounds().get(field_id) { let v = Literal::Primitive(d.literal.clone()); - let sv = self.literal_to_scalar_value(&v); + let sv = literal_to_scalar_value(&v); min_scalars[col_idx] = match (&min_scalars[col_idx], &sv) { (None, s) => Some(s.clone()), (Some(existing), s) => Some(if s < existing { @@ -456,7 +457,7 @@ impl IcebergTableProvider { // max if let Some(d) = df.upper_bounds().get(field_id) { let v = Literal::Primitive(d.literal.clone()); - let sv = self.literal_to_scalar_value(&v); + let sv = literal_to_scalar_value(&v); max_scalars[col_idx] = match (&max_scalars[col_idx], &sv) { (None, s) => Some(s.clone()), (Some(existing), s) => Some(if s > existing { @@ -492,41 +493,6 @@ impl IcebergTableProvider { } } - /// Convert Iceberg Literal to DataFusion ScalarValue - fn literal_to_scalar_value(&self, literal: &Literal) -> ScalarValue { - match literal { - Literal::Primitive(primitive) => match primitive { - PrimitiveLiteral::Boolean(v) => ScalarValue::Boolean(Some(*v)), - PrimitiveLiteral::Int(v) => ScalarValue::Int32(Some(*v)), - PrimitiveLiteral::Long(v) => ScalarValue::Int64(Some(*v)), - PrimitiveLiteral::Float(v) => ScalarValue::Float32(Some(v.into_inner())), - PrimitiveLiteral::Double(v) => ScalarValue::Float64(Some(v.into_inner())), - PrimitiveLiteral::String(v) => ScalarValue::Utf8(Some(v.clone())), - PrimitiveLiteral::Binary(v) => ScalarValue::Binary(Some(v.clone())), - PrimitiveLiteral::Int128(v) => ScalarValue::Decimal128(Some(*v), 38, 0), - PrimitiveLiteral::UInt128(v) => { - if *v <= i128::MAX as u128 { - ScalarValue::Decimal128(Some(*v as i128), 38, 0) - } else { - ScalarValue::Utf8(Some(v.to_string())) - } - } 
- }, - Literal::Struct(fields) => { - let json_repr = serde_json::to_string(fields).unwrap_or_default(); - ScalarValue::Utf8(Some(json_repr)) - } - Literal::List(items) => { - let json_repr = serde_json::to_string(items).unwrap_or_default(); - ScalarValue::Utf8(Some(json_repr)) - } - Literal::Map(pairs) => { - let json_repr = serde_json::to_string(pairs).unwrap_or_default(); - ScalarValue::Utf8(Some(json_repr)) - } - } - } - /// Create file statistics from Iceberg data file metadata fn create_file_statistics(&self, data_file: &DataFile) -> Statistics { let num_rows = Precision::Exact(data_file.record_count() as usize); @@ -560,7 +526,7 @@ impl IcebergTableProvider { .map(|datum| { // convert Datum -> Literal for existing scalar conversion let lit = Literal::Primitive(datum.literal.clone()); - self.literal_to_scalar_value(&lit) + literal_to_scalar_value(&lit) }) .map(Precision::Exact) .unwrap_or(Precision::Absent); @@ -570,7 +536,7 @@ impl IcebergTableProvider { .get(&field_id) .map(|datum| { let lit = Literal::Primitive(datum.literal.clone()); - self.literal_to_scalar_value(&lit) + literal_to_scalar_value(&lit) }) .map(Precision::Exact) .unwrap_or(Precision::Absent); diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index dc244b6d49..a4a012f018 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -11,65 +11,13 @@ use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::{BinaryExpr, Expr, Operator}; use datafusion::physical_optimizer::pruning::PruningPredicate; +use crate::datasource::literal_to_scalar_value; use crate::spec::partition::PartitionSpec; use crate::spec::types::values::{Datum, Literal}; use crate::spec::types::{PrimitiveType, Type}; use crate::spec::{DataFile, Manifest, ManifestContentType, ManifestList, Schema}; // TODO: Implement robust logical expression parsing for summary pruning -pub(crate) fn literal_to_scalar_value_local( - literal: &Literal, -) -> datafusion::common::scalar::ScalarValue { - // TODO: Add Decimal/UUID/Fixed conversion with precise semantics and timezones - match literal { - Literal::Primitive(p) => match p { - crate::spec::types::values::PrimitiveLiteral::Boolean(v) => { - datafusion::common::scalar::ScalarValue::Boolean(Some(*v)) - } - crate::spec::types::values::PrimitiveLiteral::Int(v) => { - datafusion::common::scalar::ScalarValue::Int32(Some(*v)) - } - crate::spec::types::values::PrimitiveLiteral::Long(v) => { - datafusion::common::scalar::ScalarValue::Int64(Some(*v)) - } - crate::spec::types::values::PrimitiveLiteral::Float(v) => { - datafusion::common::scalar::ScalarValue::Float32(Some(v.into_inner())) - } - crate::spec::types::values::PrimitiveLiteral::Double(v) => { - datafusion::common::scalar::ScalarValue::Float64(Some(v.into_inner())) - } - crate::spec::types::values::PrimitiveLiteral::String(v) => { - datafusion::common::scalar::ScalarValue::Utf8(Some(v.clone())) - } - crate::spec::types::values::PrimitiveLiteral::Binary(v) => { - datafusion::common::scalar::ScalarValue::Binary(Some(v.clone())) - } - crate::spec::types::values::PrimitiveLiteral::Int128(v) => { - datafusion::common::scalar::ScalarValue::Decimal128(Some(*v), 38, 0) - } - crate::spec::types::values::PrimitiveLiteral::UInt128(v) => { - if *v <= i128::MAX as u128 { - datafusion::common::scalar::ScalarValue::Decimal128(Some(*v as i128), 38, 0) - } else { - datafusion::common::scalar::ScalarValue::Utf8(Some(v.to_string())) - } - } - }, - 
Literal::Struct(fields) => { - let json_repr = serde_json::to_string(fields).unwrap_or_default(); - datafusion::common::scalar::ScalarValue::Utf8(Some(json_repr)) - } - Literal::List(items) => { - let json_repr = serde_json::to_string(items).unwrap_or_default(); - datafusion::common::scalar::ScalarValue::Utf8(Some(json_repr)) - } - Literal::Map(pairs) => { - let json_repr = serde_json::to_string(pairs).unwrap_or_default(); - datafusion::common::scalar::ScalarValue::Utf8(Some(json_repr)) - } - } -} - /// Pruning statistics over Iceberg DataFiles pub struct IcebergPruningStats { files: Vec, @@ -126,31 +74,31 @@ impl IcebergPruningStats { if let crate::spec::types::values::PrimitiveLiteral::Int(v) = datum.literal { SV::Date32(Some(v)) } else { - literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + literal_to_scalar_value(&Literal::Primitive(datum.literal.clone())) } } Some(PrimitiveType::Timestamp | PrimitiveType::TimestampNs) => { if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { SV::TimestampMicrosecond(Some(v), None) } else { - literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + literal_to_scalar_value(&Literal::Primitive(datum.literal.clone())) } } Some(PrimitiveType::Timestamptz | PrimitiveType::TimestamptzNs) => { if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { SV::TimestampMicrosecond(Some(v), Some(std::sync::Arc::from("UTC"))) } else { - literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + literal_to_scalar_value(&Literal::Primitive(datum.literal.clone())) } } Some(PrimitiveType::Time) => { if let crate::spec::types::values::PrimitiveLiteral::Long(v) = datum.literal { SV::Time64Microsecond(Some(v)) } else { - literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())) + literal_to_scalar_value(&Literal::Primitive(datum.literal.clone())) } } - _ => literal_to_scalar_value_local(&Literal::Primitive(datum.literal.clone())), + _ => literal_to_scalar_value(&Literal::Primitive(datum.literal.clone())), } } } diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_io.py b/python/pysail/tests/spark/iceberg/test_iceberg_io.py index 9a32254bb7..6a27e930cd 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_io.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_io.py @@ -5,6 +5,8 @@ from pyiceberg.schema import Schema from pyiceberg.types import DoubleType, LongType, NestedField, StringType +from pysail.tests.spark.utils import escape_sql_string_literal + @pytest.fixture def iceberg_test_data(): @@ -75,7 +77,7 @@ def test_iceberg_io_read_with_sql(spark, iceberg_test_data, expected_pandas_df, table_path = table.location() - spark.sql(f"CREATE TABLE my_iceberg USING iceberg LOCATION '{table_path}'") + spark.sql(f"CREATE TABLE my_iceberg USING iceberg LOCATION '{escape_sql_string_literal(table_path)}'") try: result_df = spark.sql("SELECT * FROM my_iceberg").sort("id") diff --git a/python/pysail/tests/spark/iceberg/test_iceberg_reads.py b/python/pysail/tests/spark/iceberg/test_iceberg_reads.py index ec00858c37..2c0a465a28 100644 --- a/python/pysail/tests/spark/iceberg/test_iceberg_reads.py +++ b/python/pysail/tests/spark/iceberg/test_iceberg_reads.py @@ -1,17 +1,15 @@ import math import pyarrow as pa -import pytest from pyiceberg.schema import Schema from pyiceberg.types import BooleanType, DoubleType, NestedField, StringType, TimestampType from .utils import create_sql_catalog # noqa: TID252 -@pytest.mark.parametrize("use_rewritten", [False, 
True]) -def test_nan_reads(spark, tmp_path, use_rewritten): +def test_nan_reads(spark, tmp_path): catalog = create_sql_catalog(tmp_path) - identifier = f"default.test_nan_reads_{'rewritten' if use_rewritten else 'orig'}" + identifier = "default.test_nan_reads" table = catalog.create_table( identifier=identifier, schema=Schema(