diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml
index 387e54ac2c..d7b9468e1b 100644
--- a/dev/docker-compose-integration.yml
+++ b/dev/docker-compose-integration.yml
@@ -18,7 +18,10 @@ services:
 
   spark-iceberg:
     image: pyiceberg-spark:latest
-    build: spark/
+    build:
+      context: spark/
+      args:
+        ICEBERG_MAVEN_MIRROR: https://repository.apache.org/content/repositories/orgapacheiceberg-1278
     container_name: pyiceberg-spark
     networks:
       iceberg_net:
diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile
index 0e1f29d152..4a052f7dfa 100644
--- a/dev/spark/Dockerfile
+++ b/dev/spark/Dockerfile
@@ -19,11 +19,12 @@ FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
 
 # Dependency versions - keep these compatible
 # Changing these will invalidate the JAR download cache layer
-ARG ICEBERG_VERSION=1.10.1
+ARG ICEBERG_VERSION=1.11.0
 ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
 ARG HADOOP_VERSION=3.4.1
 ARG AWS_SDK_VERSION=2.24.6
 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
+ARG ICEBERG_MAVEN_MIRROR=${MAVEN_MIRROR}
 
 USER root
 WORKDIR ${SPARK_HOME}
@@ -38,11 +39,20 @@ RUN apt-get update -qq && \
 
 # Download JARs with retry logic (most cacheable - only changes when versions change)
 # This is the slowest step, so we do it before copying config files
+# Iceberg JARs use ICEBERG_MAVEN_MIRROR (defaults to MAVEN_MIRROR, can be overridden for staging repos)
 RUN set -e && \
     cd "${SPARK_HOME}/jars" && \
     for jar_path in \
       "org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
-      "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
+      "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar"; \
+    do \
+      jar_name=$(basename "${jar_path}") && \
+      curl -fsSL --retry 3 --retry-delay 5 \
+        -o "${jar_name}" \
+        "${ICEBERG_MAVEN_MIRROR}/${jar_path}" && \
+      chown spark:spark "${jar_name}"; \
+    done && \
+    for jar_path in \
       "org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
       "software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
     do \
diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py
index 5da343ccb6..ac8e627d7b 100644
--- a/pyiceberg/table/inspect.py
+++ b/pyiceberg/table/inspect.py
@@ -404,6 +404,7 @@
     def _get_all_manifests_schema(self) -> pa.Schema:
         all_manifests_schema = self._get_manifests_schema()
         all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))
+        all_manifests_schema = all_manifests_schema.append(pa.field("key_metadata", pa.binary(), nullable=True))
         return all_manifests_schema
 
     def _generate_manifests_table(self, snapshot: Snapshot | None, is_all_manifests_table: bool = False) -> pa.Table:
@@ -468,6 +469,7 @@ def _partition_summaries_to_rows(
                 }
                 if is_all_manifests_table:
                     manifest_row["reference_snapshot_id"] = snapshot.snapshot_id
+                    manifest_row["key_metadata"] = manifest.key_metadata
                 manifests.append(manifest_row)
 
         return pa.Table.from_pylist(
diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py
index 03d4437d18..4d8dfbe9bb 100644
--- a/tests/integration/test_inspect_table.py
+++ b/tests/integration/test_inspect_table.py
@@ -1012,6 +1012,7 @@ def test_inspect_all_manifests(spark: SparkSession, session_catalog: Catalog, fo
         "deleted_delete_files_count",
         "partition_summaries",
         "reference_snapshot_id",
+        "key_metadata",
     ]
 
     int_cols = [