5 changes: 4 additions & 1 deletion dev/docker-compose-integration.yml
@@ -18,7 +18,10 @@
 services:
   spark-iceberg:
     image: pyiceberg-spark:latest
-    build: spark/
+    build:
+      context: spark/
+      args:
+        ICEBERG_MAVEN_MIRROR: https://repository.apache.org/content/repositories/orgapacheiceberg-1278
     container_name: pyiceberg-spark
     networks:
       iceberg_net:
14 changes: 12 additions & 2 deletions dev/spark/Dockerfile
@@ -19,11 +19,12 @@ FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
 
 # Dependency versions - keep these compatible
 # Changing these will invalidate the JAR download cache layer
-ARG ICEBERG_VERSION=1.10.1
+ARG ICEBERG_VERSION=1.11.0
 ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
 ARG HADOOP_VERSION=3.4.1
 ARG AWS_SDK_VERSION=2.24.6
 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
+ARG ICEBERG_MAVEN_MIRROR=${MAVEN_MIRROR}
 
 USER root
 WORKDIR ${SPARK_HOME}
@@ -38,11 +39,20 @@ RUN apt-get update -qq && \
 
 # Download JARs with retry logic (most cacheable - only changes when versions change)
 # This is the slowest step, so we do it before copying config files
+# Iceberg JARs use ICEBERG_MAVEN_MIRROR (defaults to MAVEN_MIRROR, can be overridden for staging repos)
 RUN set -e && \
     cd "${SPARK_HOME}/jars" && \
     for jar_path in \
         "org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
-        "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
+        "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar"; \
+    do \
+        jar_name=$(basename "${jar_path}") && \
+        curl -fsSL --retry 3 --retry-delay 5 \
+            -o "${jar_name}" \
+            "${ICEBERG_MAVEN_MIRROR}/${jar_path}" && \
+        chown spark:spark "${jar_name}"; \
+    done && \
+    for jar_path in \
         "org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
         "software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
     do \
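For readers who don't want to parse the shell, here is a minimal Python sketch (not part of the PR) of what the split download step does: the two Iceberg JARs are fetched from ICEBERG_MAVEN_MIRROR, which the compose file above points at the Apache staging repository, while the Hadoop and AWS SDK JARs keep using the default MAVEN_MIRROR. Names and values mirror the Dockerfile ARGs.

```python
# Illustrative sketch of the two-loop download logic above (not part of the PR).
import time
import urllib.request

ICEBERG_VERSION = "1.11.0"
ICEBERG_SPARK_RUNTIME_VERSION = "4.0_2.13"
HADOOP_VERSION = "3.4.1"
AWS_SDK_VERSION = "2.24.6"
MAVEN_MIRROR = "https://repo.maven.apache.org/maven2"
ICEBERG_MAVEN_MIRROR = MAVEN_MIRROR  # overridden to the staging repo by the compose build arg

ICEBERG_JARS = [
    f"org/apache/iceberg/iceberg-spark-runtime-{ICEBERG_SPARK_RUNTIME_VERSION}/{ICEBERG_VERSION}/"
    f"iceberg-spark-runtime-{ICEBERG_SPARK_RUNTIME_VERSION}-{ICEBERG_VERSION}.jar",
    f"org/apache/iceberg/iceberg-aws-bundle/{ICEBERG_VERSION}/iceberg-aws-bundle-{ICEBERG_VERSION}.jar",
]
OTHER_JARS = [
    f"org/apache/hadoop/hadoop-aws/{HADOOP_VERSION}/hadoop-aws-{HADOOP_VERSION}.jar",
    f"software/amazon/awssdk/bundle/{AWS_SDK_VERSION}/bundle-{AWS_SDK_VERSION}.jar",
]

def fetch(mirror: str, jar_path: str, retries: int = 3, delay: int = 5) -> None:
    """Download one JAR with retries, like curl --retry 3 --retry-delay 5."""
    jar_name = jar_path.rsplit("/", 1)[-1]
    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(f"{mirror}/{jar_path}", jar_name)
            return
        except OSError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

for path in ICEBERG_JARS:
    fetch(ICEBERG_MAVEN_MIRROR, path)  # staging-capable mirror
for path in OTHER_JARS:
    fetch(MAVEN_MIRROR, path)          # always the default mirror
```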
2 changes: 2 additions & 0 deletions pyiceberg/table/inspect.py
@@ -404,6 +404,7 @@ def _get_all_manifests_schema(self) -> pa.Schema:
 
         all_manifests_schema = self._get_manifests_schema()
         all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))
+        all_manifests_schema = all_manifests_schema.append(pa.field("key_metadata", pa.binary(), nullable=True))
         return all_manifests_schema
 
     def _generate_manifests_table(self, snapshot: Snapshot | None, is_all_manifests_table: bool = False) -> pa.Table:
@@ -468,6 +469,7 @@ def _partition_summaries_to_rows(
                 }
                 if is_all_manifests_table:
                     manifest_row["reference_snapshot_id"] = snapshot.snapshot_id
+                    manifest_row["key_metadata"] = manifest.key_metadata
                 manifests.append(manifest_row)
 
         return pa.Table.from_pylist(
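With this change, the `all_manifests` metadata table carries a nullable binary `key_metadata` column alongside `reference_snapshot_id`, exposing each manifest's encryption key metadata when present. A hedged usage sketch (catalog and table names are hypothetical):

```python
# Usage sketch (hypothetical catalog/table names): read key_metadata from all manifests.
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")            # assumes a configured catalog named "default"
tbl = catalog.load_table("examples.events")  # hypothetical table identifier

all_manifests = tbl.inspect.all_manifests()  # returns a pyarrow.Table
print(all_manifests.schema.field("key_metadata"))  # key_metadata: binary, nullable
# None for unencrypted manifests; raw key metadata bytes otherwise
print(all_manifests.column("key_metadata").to_pylist())
```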
1 change: 1 addition & 0 deletions tests/integration/test_inspect_table.py
@@ -1012,6 +1012,7 @@ def test_inspect_all_manifests(spark: SparkSession, session_catalog: Catalog, fo
         "deleted_delete_files_count",
         "partition_summaries",
         "reference_snapshot_id",
+        "key_metadata",
     ]
 
     int_cols = [
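The test above only extends the expected column list; a minimal standalone check in the same spirit (function and argument names assumed, not from the PR) might look like:

```python
# Minimal sketch (assumed names): assert the new column exists and is binary-typed.
import pyarrow as pa

def check_key_metadata(df: pa.Table) -> None:
    assert "key_metadata" in df.column_names
    assert df.schema.field("key_metadata").type == pa.binary()
    assert df.schema.field("key_metadata").nullable
```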