99 check_plot_dependency_graph ,
1010 check_pmi ,
1111 check_refresh_rate ,
12+ check_restart_limit ,
1213 check_wait_on_shutdown ,
1314 validate_number_of_cores ,
1415)
@@ -70,6 +71,7 @@ class FluxJobExecutor(BaseExecutor):
7071 export_workflow_filename (str): Name of the file to store the exported workflow graph in.
7172 log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
7273 wait (bool): Whether to wait for the completion of all tasks before shutting down the executor.
74+ restart_limit (int): The maximum number of times worker processes are restarted - default 0
7375 openmpi_oversubscribe (bool): adds the `--oversubscribe` command flag (OpenMPI and SLURM) - default False
7476
7577 Examples:
@@ -113,6 +115,7 @@ def __init__(
113115 export_workflow_filename : Optional [str ] = None ,
114116 log_obj_size : bool = False ,
115117 wait : bool = True ,
118+ restart_limit : int = 0 ,
116119 openmpi_oversubscribe : bool = False ,
117120 ):
118121 """
@@ -164,6 +167,7 @@ def __init__(
164167 export_workflow_filename (str): Name of the file to store the exported workflow graph in.
165168 log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
166169 wait (bool): Whether to wait for the completion of all tasks before shutting down the executor.
170+ restart_limit (int): The maximum number of times worker processes are restarted - default 0
167171 openmpi_oversubscribe (bool): adds the `--oversubscribe` command flag (OpenMPI and SLURM) - default False
168172
169173 """
@@ -180,6 +184,9 @@ def __init__(
180184 resource_dict .update (
181185 {k : v for k , v in default_resource_dict .items () if k not in resource_dict }
182186 )
187+ check_restart_limit (
188+ restart_limit = restart_limit , block_allocation = block_allocation
189+ )
183190 if not disable_dependencies :
184191 super ().__init__ (
185192 executor = DependencyTaskScheduler (
@@ -197,6 +204,7 @@ def __init__(
197204 init_function = init_function ,
198205 log_obj_size = log_obj_size ,
199206 wait = wait ,
207+ restart_limit = restart_limit ,
200208 ),
201209 max_cores = max_cores ,
202210 refresh_rate = refresh_rate ,
@@ -223,6 +231,7 @@ def __init__(
223231 init_function = init_function ,
224232 log_obj_size = log_obj_size ,
225233 wait = wait ,
234+ restart_limit = restart_limit ,
226235 )
227236 )
228237
@@ -464,6 +473,7 @@ def create_flux_executor(
464473 init_function : Optional [Callable ] = None ,
465474 log_obj_size : bool = False ,
466475 wait : bool = True ,
476+ restart_limit : int = 0 ,
467477) -> Union [OneProcessTaskScheduler , BlockAllocationTaskScheduler ]:
468478 """
469479 Create a flux executor
@@ -504,6 +514,7 @@ def create_flux_executor(
504514 init_function (None): optional function to preset arguments for functions which are submitted later
505515 log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
506516 wait (bool): Whether to wait for the completion of all tasks before shutting down the executor.
517+ restart_limit (int): The maximum number of times worker processes are restarted - default 0
507518
508519 Returns:
509520 OneProcessTaskScheduler / BlockAllocationTaskScheduler
@@ -551,6 +562,7 @@ def create_flux_executor(
551562 max_workers = max_workers ,
552563 executor_kwargs = resource_dict ,
553564 spawner = FluxPythonSpawner ,
565+ restart_limit = restart_limit ,
554566 )
555567 else :
556568 return OneProcessTaskScheduler (
0 commit comments