@@ -138,6 +138,35 @@ pub(crate) async fn read_snapshot<'metadata>(
138138 ManifestListReader :: new ( bytes, table_metadata)
139139}
140140
141+ /// A writer for Iceberg manifest list files that manages the creation and updating of manifest lists.
142+ ///
143+ /// The ManifestListWriter is responsible for:
144+ /// - Creating new manifest list files from scratch or updating existing ones
145+ /// - Managing manifest entries and their metadata
146+ /// - Optimizing data file organization through splitting and partitioning
147+ /// - Writing the final manifest list to object storage
148+ ///
149+ /// This writer can operate in two modes:
150+ /// 1. **New manifest list**: Creates a completely new manifest list from data files
151+ /// 2. **Append to existing**: Reuses compatible manifests from an existing manifest list
152+ ///
153+ /// The writer automatically handles:
154+ /// - Partition boundary calculations
155+ /// - Manifest splitting for optimal performance
156+ /// - Schema compatibility between format versions
157+ /// - Concurrent manifest writing operations
158+ ///
159+ /// # Type Parameters
160+ /// * `'schema` - The lifetime of the Avro schema used for serialization
161+ /// * `'metadata` - The lifetime of the table metadata reference
162+ ///
163+ /// # Fields
164+ /// * `table_metadata` - Reference to the table metadata for schema and configuration
165+ /// * `writer` - The underlying Avro writer for manifest list serialization
166+ /// * `selected_manifest` - Optional existing manifest that can be reused for appends
167+ /// * `bounding_partition_values` - Computed partition boundaries for the data files
168+ /// * `n_existing_files` - Count of existing files for split calculations
169+ /// * `branch` - Optional branch name for multi-branch table operations
141170pub ( crate ) struct ManifestListWriter < ' schema , ' metadata > {
142171 table_metadata : & ' metadata TableMetadata ,
143172 writer : AvroWriter < ' schema , Vec < u8 > > ,
@@ -148,6 +177,36 @@ pub(crate) struct ManifestListWriter<'schema, 'metadata> {
148177}
149178
150179impl < ' schema , ' metadata > ManifestListWriter < ' schema , ' metadata > {
180+ /// Creates a new ManifestListWriter for building a manifest list from scratch.
181+ ///
182+ /// This constructor initializes a writer that will create a completely new manifest list
183+ /// without reusing any existing manifests. It computes partition boundaries from the
184+ /// provided data files and sets up the Avro writer with the appropriate schema.
185+ ///
186+ /// # Arguments
187+ /// * `data_files` - Iterator over data files to compute partition boundaries from
188+ /// * `schema` - The Avro schema to use for manifest list serialization
189+ /// * `table_metadata` - Reference to the table metadata for partition field information
190+ /// * `branch` - Optional branch name for multi-branch table operations
191+ ///
192+ /// # Returns
193+ /// * `Result<Self, Error>` - A new ManifestListWriter instance or an error
194+ ///
195+ /// # Errors
196+ /// Returns an error if:
197+ /// * The partition fields cannot be retrieved from table metadata
198+ /// * Partition boundary computation fails
199+ /// * The Avro writer cannot be initialized
200+ ///
201+ /// # Example Usage
202+ /// ```ignore
203+ /// let writer = ManifestListWriter::new(
204+ /// data_files.iter(),
205+ /// &manifest_list_schema,
206+ /// &table_metadata,
207+ /// Some("main"),
208+ /// )?;
209+ /// ```
151210 pub ( crate ) fn new < ' datafiles > (
152211 data_files : impl Iterator < Item = & ' datafiles DataFile > ,
153212 schema : & ' schema AvroSchema ,
@@ -176,6 +235,48 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
176235 } )
177236 }
178237
238+ /// Creates a new ManifestListWriter from an existing manifest list, optimizing for append operations.
239+ ///
240+ /// This constructor analyzes an existing manifest list to determine which manifests can be
241+ /// reused for the new operation. It selects compatible manifests based on partition boundaries
242+ /// and copies other manifests to the new manifest list. This approach optimizes append
243+ /// operations by avoiding unnecessary manifest rewrites.
244+ ///
245+ /// The method:
246+ /// 1. Reads the existing manifest list to understand current manifests
247+ /// 2. Computes partition boundaries for the new data files
248+ /// 3. Selects manifests that can be reused (partitioned vs unpartitioned logic)
249+ /// 4. Copies non-selected manifests to the new manifest list
250+ /// 5. Prepares to append new data to the selected manifest
251+ ///
252+ /// # Arguments
253+ /// * `bytes` - The raw bytes of the existing manifest list file
254+ /// * `data_files` - Iterator over new data files to be appended
255+ /// * `schema` - The Avro schema to use for manifest list serialization
256+ /// * `table_metadata` - Reference to the table metadata for partition field information
257+ /// * `branch` - Optional branch name for multi-branch table operations
258+ ///
259+ /// # Returns
260+ /// * `Result<Self, Error>` - A new ManifestListWriter instance with the selected manifest, or an error
261+ ///
262+ /// # Errors
263+ /// Returns an error if:
264+ /// * The existing manifest list cannot be parsed
265+ /// * Partition fields cannot be retrieved from table metadata
266+ /// * Partition boundary computation fails
267+ /// * Manifest selection logic fails
268+ /// * The Avro writer cannot be initialized
269+ ///
270+ /// # Example Usage
271+ /// ```ignore
272+ /// let writer = ManifestListWriter::from_existing(
273+ /// &existing_manifest_list_bytes,
274+ /// new_data_files.iter(),
275+ /// &manifest_list_schema,
276+ /// &table_metadata,
277+ /// Some("main"),
278+ /// )?;
279+ /// ```
179280 pub ( crate ) fn from_existing < ' datafiles > (
180281 bytes : & [ u8 ] ,
181282 data_files : impl Iterator < Item = & ' datafiles DataFile > ,
@@ -220,6 +321,27 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
220321 } )
221322 }
222323
324+ /// Calculates the optimal number of manifest splits for the given number of data files.
325+ ///
326+ /// This method determines how many manifest files should be created to optimize
327+ /// query performance and manage file sizes. The calculation considers:
328+ /// - The number of existing files in the table
329+ /// - The number of new data files being added
330+ /// - The number of files in any selected (reusable) manifest
331+ ///
332+ /// The splitting strategy helps maintain optimal manifest sizes for efficient
333+ /// query planning and metadata operations.
334+ ///
335+ /// # Arguments
336+ /// * `n_data_files` - The number of new data files being added
337+ ///
338+ /// # Returns
339+ /// * `u32` - The recommended number of manifest splits
340+ ///
341+ /// # Example Usage
342+ /// ```ignore
343+ /// let splits = writer.n_splits(1000); // Calculate splits for 1000 new files
344+ /// ```
223345 pub ( crate ) fn n_splits ( & self , n_data_files : usize ) -> u32 {
224346 let selected_manifest_file_count = self
225347 . selected_manifest
@@ -244,6 +366,48 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
244366 )
245367 }
246368
369+ /// Appends data files to a single manifest and finalizes the manifest list.
370+ ///
371+ /// This method creates a single manifest file containing all the provided data files,
372+ /// either by appending to an existing reusable manifest or creating a new one.
373+ /// It then writes the complete manifest list to object storage.
374+ ///
375+ /// This approach is optimal for:
376+ /// - Small to medium append operations
377+ /// - Cases where manifest splitting is not required
378+ /// - Simple append operations without complex partitioning needs
379+ ///
380+ /// The process:
381+ /// 1. Determines whether to reuse an existing manifest or create a new one
382+ /// 2. Creates/updates a manifest writer with the selected manifest
383+ /// 3. Appends all provided data files to the manifest
384+ /// 4. Finalizes the manifest and writes it to storage
385+ /// 5. Adds the manifest entry to the manifest list
386+ /// 6. Writes the complete manifest list to storage
387+ ///
388+ /// # Arguments
389+ /// * `data_files` - Iterator over manifest entries to append
390+ /// * `snapshot_id` - The snapshot ID for the new manifest
391+ /// * `object_store` - The object store for writing files
392+ ///
393+ /// # Returns
394+ /// * `Result<String, Error>` - The location of the new manifest list file or an error
395+ ///
396+ /// # Errors
397+ /// Returns an error if:
398+ /// * Manifest schema creation fails
399+ /// * Manifest writer creation or operation fails
400+ /// * Object storage operations fail
401+ /// * Avro serialization fails
402+ ///
403+ /// # Example Usage
404+ /// ```ignore
405+ /// let manifest_list_location = writer.append_and_finish(
406+ /// data_files_iter,
407+ /// snapshot_id,
408+ /// object_store,
409+ /// ).await?;
410+ /// ```
247411 pub ( crate ) async fn append_and_finish (
248412 mut self ,
249413 data_files : impl Iterator < Item = Result < ManifestEntry , Error > > ,
@@ -314,6 +478,54 @@ impl<'schema, 'metadata> ManifestListWriter<'schema, 'metadata> {
314478 Ok ( new_manifest_list_location)
315479 }
316480
481+ /// Appends data files by splitting them across multiple manifests and finalizes the manifest list.
482+ ///
483+ /// This method is designed for large append operations where splitting data files across
484+ /// multiple manifest files provides better query performance and parallelism. It distributes
485+ /// the data files across the specified number of splits based on partition boundaries.
486+ ///
487+ /// This approach is optimal for:
488+ /// - Large append operations with hundreds or thousands of files
489+ /// - Partitioned tables where files can be split by partition boundaries
490+ /// - Cases requiring high query parallelism and performance
491+ ///
492+ /// The process:
493+ /// 1. Computes optimal partition boundaries for splitting
494+ /// 2. Merges new data files with existing files from selected manifest (if any)
495+ /// 3. Splits all files across the specified number of manifest files
496+ /// 4. Creates and writes multiple manifest files concurrently
497+ /// 5. Adds all manifest entries to the manifest list
498+ /// 6. Writes the complete manifest list to storage
499+ ///
500+ /// # Arguments
501+ /// * `data_files` - Iterator over manifest entries to append and split
502+ /// * `snapshot_id` - The snapshot ID for the new manifests
503+ /// * `n_splits` - The number of manifest files to create (should match the result of `n_splits()`)
504+ /// * `object_store` - The object store for writing files
505+ ///
506+ /// # Returns
507+ /// * `Result<String, Error>` - The location of the new manifest list file or an error
508+ ///
509+ /// # Errors
510+ /// Returns an error if:
511+ /// * Partition field retrieval fails
512+ /// * Manifest schema creation fails
513+ /// * File splitting logic fails
514+ /// * Manifest writer creation or operation fails
515+ /// * Concurrent manifest writing fails
516+ /// * Object storage operations fail
517+ /// * Avro serialization fails
518+ ///
519+ /// # Example Usage
520+ /// ```ignore
521+ /// let n_splits = writer.n_splits(data_files.len());
522+ /// let manifest_list_location = writer.append_split_and_finish(
523+ /// data_files_iter,
524+ /// snapshot_id,
525+ /// n_splits,
526+ /// object_store,
527+ /// ).await?;
528+ /// ```
317529 pub ( crate ) async fn append_split_and_finish (
318530 mut self ,
319531 data_files : impl Iterator < Item = Result < ManifestEntry , Error > > ,
0 commit comments