Skip to content

Commit 2a5c5e0

Browse files
author
Jan Kaul
committed
documentation
1 parent eeabfc6 commit 2a5c5e0

1 file changed

Lines changed: 212 additions & 0 deletions

File tree

iceberg-rust/src/table/manifest_list.rs

Lines changed: 212 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,35 @@ pub(crate) async fn read_snapshot<'metadata>(
138138
ManifestListReader::new(bytes, table_metadata)
139139
}
140140

141+
/// A writer for Iceberg manifest list files that manages the creation and updating of manifest lists.
142+
///
143+
/// The ManifestListWriter is responsible for:
144+
/// - Creating new manifest list files from scratch or updating existing ones
145+
/// - Managing manifest entries and their metadata
146+
/// - Optimizing data file organization through splitting and partitioning
147+
/// - Writing the final manifest list to object storage
148+
///
149+
/// This writer can operate in two modes:
150+
/// 1. **New manifest list**: Creates a completely new manifest list from data files
151+
/// 2. **Append to existing**: Reuses compatible manifests from an existing manifest list
152+
///
153+
/// The writer automatically handles:
154+
/// - Partition boundary calculations
155+
/// - Manifest splitting for optimal performance
156+
/// - Schema compatibility between format versions
157+
/// - Concurrent manifest writing operations
158+
///
159+
/// # Lifetime Parameters
160+
/// * `'schema` - The lifetime of the Avro schema used for serialization
161+
/// * `'metadata` - The lifetime of the table metadata reference
162+
///
163+
/// # Fields
164+
/// * `table_metadata` - Reference to the table metadata for schema and configuration
165+
/// * `writer` - The underlying Avro writer for manifest list serialization
166+
/// * `selected_manifest` - Optional existing manifest that can be reused for appends
167+
/// * `bounding_partition_values` - Computed partition boundaries for the data files
168+
/// * `n_existing_files` - Count of existing files for split calculations
169+
/// * `branch` - Optional branch name for multi-branch table operations
141170
pub(crate) struct ManifestListWriter<'schema, 'metadata> {
142171
table_metadata: &'metadata TableMetadata,
143172
writer: AvroWriter<'schema, Vec<u8>>,
@@ -148,6 +177,36 @@ pub(crate) struct ManifestListWriter<'schema, 'metadata> {
148177
}
149178

150179
impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
180+
/// Creates a new ManifestListWriter for building a manifest list from scratch.
181+
///
182+
/// This constructor initializes a writer that will create a completely new manifest list
183+
/// without reusing any existing manifests. It computes partition boundaries from the
184+
/// provided data files and sets up the Avro writer with the appropriate schema.
185+
///
186+
/// # Arguments
187+
/// * `data_files` - Iterator over data files to compute partition boundaries from
188+
/// * `schema` - The Avro schema to use for manifest list serialization
189+
/// * `table_metadata` - Reference to the table metadata for partition field information
190+
/// * `branch` - Optional branch name for multi-branch table operations
191+
///
192+
/// # Returns
193+
/// * `Result<Self, Error>` - A new ManifestListWriter instance or an error
194+
///
195+
/// # Errors
196+
/// Returns an error if:
197+
/// * The partition fields cannot be retrieved from table metadata
198+
/// * Partition boundary computation fails
199+
/// * The Avro writer cannot be initialized
200+
///
201+
/// # Example Usage
202+
/// ```ignore
203+
/// let writer = ManifestListWriter::new(
204+
/// data_files.iter(),
205+
/// &manifest_list_schema,
206+
/// &table_metadata,
207+
/// Some("main"),
208+
/// )?;
209+
/// ```
151210
pub(crate) fn new<'datafiles>(
152211
data_files: impl Iterator<Item = &'datafiles DataFile>,
153212
schema: &'schema AvroSchema,
@@ -176,6 +235,48 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
176235
})
177236
}
178237

238+
/// Creates a new ManifestListWriter from an existing manifest list, optimizing for append operations.
239+
///
240+
/// This constructor analyzes an existing manifest list to determine which manifests can be
241+
/// reused for the new operation. It selects compatible manifests based on partition boundaries
242+
/// and copies other manifests to the new manifest list. This approach optimizes append
243+
/// operations by avoiding unnecessary manifest rewrites.
244+
///
245+
/// The method:
246+
/// 1. Reads the existing manifest list to understand current manifests
247+
/// 2. Computes partition boundaries for the new data files
248+
/// 3. Selects manifests that can be reused (partitioned vs unpartitioned logic)
249+
/// 4. Copies non-selected manifests to the new manifest list
250+
/// 5. Prepares to append new data to the selected manifest
251+
///
252+
/// # Arguments
253+
/// * `bytes` - The raw bytes of the existing manifest list file
254+
/// * `data_files` - Iterator over new data files to be appended
255+
/// * `schema` - The Avro schema to use for manifest list serialization
256+
/// * `table_metadata` - Reference to the table metadata for partition field information
257+
/// * `branch` - Optional branch name for multi-branch table operations
258+
///
259+
/// # Returns
260+
/// * `Result<Self, Error>` - A new ManifestListWriter instance with a selected manifest, or an error
261+
///
262+
/// # Errors
263+
/// Returns an error if:
264+
/// * The existing manifest list cannot be parsed
265+
/// * Partition fields cannot be retrieved from table metadata
266+
/// * Partition boundary computation fails
267+
/// * Manifest selection logic fails
268+
/// * The Avro writer cannot be initialized
269+
///
270+
/// # Example Usage
271+
/// ```ignore
272+
/// let writer = ManifestListWriter::from_existing(
273+
/// &existing_manifest_list_bytes,
274+
/// new_data_files.iter(),
275+
/// &manifest_list_schema,
276+
/// &table_metadata,
277+
/// Some("main"),
278+
/// )?;
279+
/// ```
179280
pub(crate) fn from_existing<'datafiles>(
180281
bytes: &[u8],
181282
data_files: impl Iterator<Item = &'datafiles DataFile>,
@@ -220,6 +321,27 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
220321
})
221322
}
222323

324+
/// Calculates the optimal number of manifest splits for the given number of data files.
325+
///
326+
/// This method determines how many manifest files should be created to optimize
327+
/// query performance and manage file sizes. The calculation considers:
328+
/// - The number of existing files in the table
329+
/// - The number of new data files being added
330+
/// - The number of files in any selected (reusable) manifest
331+
///
332+
/// The splitting strategy helps maintain optimal manifest sizes for efficient
333+
/// query planning and metadata operations.
334+
///
335+
/// # Arguments
336+
/// * `n_data_files` - The number of new data files being added
337+
///
338+
/// # Returns
339+
/// * `u32` - The recommended number of manifest splits
340+
///
341+
/// # Example Usage
342+
/// ```ignore
343+
/// let splits = writer.n_splits(1000); // Calculate splits for 1000 new files
344+
/// ```
223345
pub(crate) fn n_splits(&self, n_data_files: usize) -> u32 {
224346
let selected_manifest_file_count = self
225347
.selected_manifest
@@ -244,6 +366,48 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
244366
)
245367
}
246368

369+
/// Appends data files to a single manifest and finalizes the manifest list.
370+
///
371+
/// This method creates a single manifest file containing all the provided data files,
372+
/// either by appending to an existing reusable manifest or creating a new one.
373+
/// It then writes the complete manifest list to object storage.
374+
///
375+
/// This approach is optimal for:
376+
/// - Small to medium append operations
377+
/// - Cases where manifest splitting is not required
378+
/// - Simple append operations without complex partitioning needs
379+
///
380+
/// The process:
381+
/// 1. Determines whether to reuse an existing manifest or create a new one
382+
/// 2. Creates/updates a manifest writer with the selected manifest
383+
/// 3. Appends all provided data files to the manifest
384+
/// 4. Finalizes the manifest and writes it to storage
385+
/// 5. Adds the manifest entry to the manifest list
386+
/// 6. Writes the complete manifest list to storage
387+
///
388+
/// # Arguments
389+
/// * `data_files` - Iterator over manifest entries to append
390+
/// * `snapshot_id` - The snapshot ID for the new manifest
391+
/// * `object_store` - The object store for writing files
392+
///
393+
/// # Returns
394+
/// * `Result<String, Error>` - The location of the new manifest list file or an error
395+
///
396+
/// # Errors
397+
/// Returns an error if:
398+
/// * Manifest schema creation fails
399+
/// * Manifest writer creation or operation fails
400+
/// * Object storage operations fail
401+
/// * Avro serialization fails
402+
///
403+
/// # Example Usage
404+
/// ```ignore
405+
/// let manifest_list_location = writer.append_and_finish(
406+
/// data_files_iter,
407+
/// snapshot_id,
408+
/// object_store,
409+
/// ).await?;
410+
/// ```
247411
pub(crate) async fn append_and_finish(
248412
mut self,
249413
data_files: impl Iterator<Item = Result<ManifestEntry, Error>>,
@@ -314,6 +478,54 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
314478
Ok(new_manifest_list_location)
315479
}
316480

481+
/// Appends data files by splitting them across multiple manifests and finalizes the manifest list.
482+
///
483+
/// This method is designed for large append operations where splitting data files across
484+
/// multiple manifest files provides better query performance and parallelism. It distributes
485+
/// the data files across the specified number of splits based on partition boundaries.
486+
///
487+
/// This approach is optimal for:
488+
/// - Large append operations with hundreds or thousands of files
489+
/// - Partitioned tables where files can be split by partition boundaries
490+
/// - Cases requiring high query parallelism and performance
491+
///
492+
/// The process:
493+
/// 1. Computes optimal partition boundaries for splitting
494+
/// 2. Merges new data files with existing files from selected manifest (if any)
495+
/// 3. Splits all files across the specified number of manifest files
496+
/// 4. Creates and writes multiple manifest files concurrently
497+
/// 5. Adds all manifest entries to the manifest list
498+
/// 6. Writes the complete manifest list to storage
499+
///
500+
/// # Arguments
501+
/// * `data_files` - Iterator over manifest entries to append and split
502+
/// * `snapshot_id` - The snapshot ID for the new manifests
503+
/// * `n_splits` - The number of manifest files to create (should match `n_splits()` result)
504+
/// * `object_store` - The object store for writing files
505+
///
506+
/// # Returns
507+
/// * `Result<String, Error>` - The location of the new manifest list file or an error
508+
///
509+
/// # Errors
510+
/// Returns an error if:
511+
/// * Partition field retrieval fails
512+
/// * Manifest schema creation fails
513+
/// * File splitting logic fails
514+
/// * Manifest writer creation or operation fails
515+
/// * Concurrent manifest writing fails
516+
/// * Object storage operations fail
517+
/// * Avro serialization fails
518+
///
519+
/// # Example Usage
520+
/// ```ignore
521+
/// let n_splits = writer.n_splits(data_files.len());
522+
/// let manifest_list_location = writer.append_split_and_finish(
523+
/// data_files_iter,
524+
/// snapshot_id,
525+
/// n_splits,
526+
/// object_store,
527+
/// ).await?;
528+
/// ```
317529
pub(crate) async fn append_split_and_finish(
318530
mut self,
319531
data_files: impl Iterator<Item = Result<ManifestEntry, Error>>,

0 commit comments

Comments
 (0)