Skip to content

Commit 2ef10f9

Browse files
committed
fix(builder): handle resume when metadata.json missing (interrupted before first batch)
When a run is interrupted before any row group (async) or batch (sync) completes, metadata.json is never written. Previously, resume=True raised DatasetGenerationError in this case. Now build() detects the missing file, logs an info message, clears any leftover partial results, and falls back to a fresh run. This is the common scenario for small datasets (fewer records than buffer_size), where all records fit in a single row group.
1 parent 51bf29a commit 2ef10f9

3 files changed

Lines changed: 47 additions & 12 deletions

File tree

packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,17 @@ def build(
185185
start_time = time.perf_counter()
186186
buffer_size = self._resource_provider.run_config.buffer_size
187187

188+
if resume and not self.artifact_storage.metadata_file_path.exists():
189+
# No metadata.json means the previous run was interrupted before any batch (sync) or
190+
# row group (async) completed. Nothing to resume — discard any leftover partial
191+
# results and start fresh.
192+
logger.info(
193+
"▶️ No metadata.json found — the previous run was interrupted before any batch "
194+
"completed. Starting generation from the beginning."
195+
)
196+
self.artifact_storage.clear_partial_results()
197+
resume = False
198+
188199
generated = True
189200
if DATA_DESIGNER_ASYNC_ENGINE:
190201
self._validate_async_compatibility()

packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -968,20 +968,27 @@ def _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_p
968968
)
969969

970970

971-
def test_build_resume_raises_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path):
972-
"""resume=True when only the folder exists (no metadata.json) raises DatasetGenerationError.
971+
def test_build_resume_starts_fresh_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path, caplog):
972+
"""resume=True when only the folder exists (no metadata.json) logs an info message and starts fresh.
973973
974974
This covers the case where a run was interrupted before any batch completed — the
975975
folder was created by _write_builder_config but metadata.json was never written.
976+
Previously this raised DatasetGenerationError; now it logs an info message and restarts from batch 0.
976977
"""
977978
# Pre-create the folder with content so resolved_dataset_name(resume=True) returns "dataset"
978979
dataset_dir = tmp_path / "dataset"
979980
dataset_dir.mkdir()
980981
(dataset_dir / "builder_config.json").write_text("{}") # non-empty, no metadata
981982

982983
builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path)
983-
with pytest.raises(DatasetGenerationError, match="metadata.json not found"):
984-
builder.build(num_records=4, resume=True)
984+
with caplog.at_level(logging.INFO):
985+
with patch.object(builder, "_run_model_health_check_if_needed"):
986+
with patch.object(builder, "_run_batch"):
987+
with patch.object(builder.batch_manager, "finish"):
988+
# resume=False is set internally; build dispatches to the normal (non-resume) path
989+
builder.build(num_records=4, resume=True)
990+
991+
assert any("interrupted before any batch completed" in record.message for record in caplog.records)
985992

986993

987994
def test_build_resume_raises_on_num_records_mismatch(stub_resource_provider, stub_test_config_builder, tmp_path):
@@ -1132,18 +1139,31 @@ def test_build_async_resume_logs_warning_when_already_complete(
11321139
assert any("already complete" in record.message for record in caplog.records)
11331140

11341141

1135-
def test_build_async_resume_raises_without_metadata(stub_resource_provider, stub_test_config_builder, tmp_path):
1136-
"""Async resume raises DatasetGenerationError when metadata.json is missing."""
1142+
def test_build_async_resume_starts_fresh_without_metadata(
1143+
stub_resource_provider, stub_test_config_builder, tmp_path, caplog
1144+
):
1145+
"""Async resume with no metadata.json logs an info message and starts fresh.
1146+
1147+
Previously this raised DatasetGenerationError; now it logs an info message and restarts from row group 0.
1148+
The log is emitted in build() before dispatching to _build_async, so mocking _build_async
1149+
does not suppress the message.
1150+
"""
11371151
dataset_dir = tmp_path / "dataset"
11381152
dataset_dir.mkdir()
11391153
(dataset_dir / "builder_config.json").write_text("{}")
11401154

11411155
builder = _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path)
11421156

1143-
with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True):
1144-
with patch.object(builder, "_run_model_health_check_if_needed"):
1145-
with pytest.raises(DatasetGenerationError, match="metadata.json not found"):
1146-
builder.build(num_records=4, resume=True)
1157+
with caplog.at_level(logging.INFO):
1158+
with patch.object(builder_mod, "DATA_DESIGNER_ASYNC_ENGINE", True):
1159+
with patch.object(builder, "_run_model_health_check_if_needed"):
1160+
with patch.object(builder, "_build_async", return_value=True) as mock_async:
1161+
builder.build(num_records=4, resume=True)
1162+
1163+
# _build_async is called with resume=False because the no-metadata path resets the flag
1164+
_, kwargs = mock_async.call_args
1165+
assert kwargs.get("resume") is False
1166+
assert any("interrupted before any batch completed" in record.message for record in caplog.records)
11471167

11481168

11491169
def test_build_async_resume_already_complete_does_not_run_after_generation_processors(

packages/data-designer/tests/cli/commands/test_create_command.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non
1818
mock_ctrl = MagicMock()
1919
mock_ctrl_cls.return_value = mock_ctrl
2020

21-
create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None)
21+
create_command(
22+
config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None
23+
)
2224

2325
mock_ctrl_cls.assert_called_once()
2426
mock_ctrl.run_create.assert_called_once_with(
@@ -59,7 +61,9 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock)
5961
mock_ctrl = MagicMock()
6062
mock_ctrl_cls.return_value = mock_ctrl
6163

62-
create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None)
64+
create_command(
65+
config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None
66+
)
6367

6468
mock_ctrl.run_create.assert_called_once_with(
6569
config_source="config.yaml",

0 commit comments

Comments
 (0)