diff --git a/examples/aerostructural/supersonic_panel/as_opt_remote_parallel.py b/examples/aerostructural/supersonic_panel/as_opt_remote_parallel.py index 4c245a1d..6f4208e7 100644 --- a/examples/aerostructural/supersonic_panel/as_opt_remote_parallel.py +++ b/examples/aerostructural/supersonic_panel/as_opt_remote_parallel.py @@ -14,7 +14,9 @@ def setup(self): # NOTE: make sure setup isn't called multiple times, otherwise the first jobs/port forwarding will go unused and you'll have to stop them manually for i in range(self.options["num_scenarios"]): - pbs_launcher = PBS.k4(time=1) + pbs_launcher = PBS.k4( + profile_filename="~/.bashrc", requested_number_of_nodes=1, time=1 + ) pbs_launcher.mpiexec = "mpirun" pbs_launcher.requested_number_of_nodes = 1 diff --git a/examples/aerostructural/supersonic_panel/as_opt_remote_serial.py b/examples/aerostructural/supersonic_panel/as_opt_remote_serial.py index 6b14f0f1..09a147e2 100644 --- a/examples/aerostructural/supersonic_panel/as_opt_remote_serial.py +++ b/examples/aerostructural/supersonic_panel/as_opt_remote_serial.py @@ -60,10 +60,25 @@ def run_optimization(prob: om.Problem): def main(): check_totals = False + hpc = "k" # nas or k - pbs = PBS.k4(time=1) - pbs.mpiexec = "mpirun" - pbs.requested_number_of_nodes = 1 + if hpc == "nas": + + pbs = PBS.nas( + profile_filename="~/.bashrc", + # group_list=None, # add group list here + proc_type="rom", + requested_number_of_nodes=1, + time=1, + ) + + elif hpc == "k": + + pbs = PBS.k4( + profile_filename="~/.bashrc", + requested_number_of_nodes=1, + time=1, + ) prob = om.Problem() prob.model.add_subsystem( diff --git a/mphys/network/remote_component.py b/mphys/network/remote_component.py index b0468b4a..bfb26bf1 100644 --- a/mphys/network/remote_component.py +++ b/mphys/network/remote_component.py @@ -112,12 +112,20 @@ def initialize(self): types=bool, desc="Skip the objective/constraint definition. The quantities will still be added to outputs", ) + self.options.declare( + "stop_server_for_down_time", + default=0, + types=int, + desc="Stop server after evaluation, in case significant down time is expected afterwards. Allows user to conserve HPC " + + "SBUs in certain applications. 0=never, 1=after first function call, 2=after first derivative call.", + ) @switch_run_directory def setup(self): self.var_naming_dot_replacement = self.options["var_naming_dot_replacement"] self.use_derivative_coloring = self.options["use_derivative_coloring"] self.derivative_coloring_num = 0 + self.stop_server_for_down_time = self.options["stop_server_for_down_time"] output_dict = None if self.comm.rank == 0: @@ -232,6 +240,17 @@ def evaluate_model(self, remote_input_dict=None, command="initialize"): else: self.times_function = np.hstack([self.times_function, model_time_elapsed]) + if self.stop_server_for_down_time > 0: + if self.stop_server_for_down_time == 1 or ( + self.stop_server_for_down_time == 2 + and self._doing_derivative_evaluation(command) + ): + if self.comm.rank == 0: + print( + f"CLIENT (subsystem {self.name}): Stopping server's HPC job for down time" + ) + self.server_manager.stop_server() + return remote_output_dict def _assign_objective_partials_from_remote_output(self, remote_dict, partials): @@ -450,13 +469,13 @@ def _lower_bound_used(self, bound): if hasattr(bound, "__len__"): return (np.array(bound) > -1e20).any() else: - return bound + return bound > -1e20 def _upper_bound_used(self, bound): if hasattr(bound, "__len__"): return (np.array(bound) < 1e20).any() else: - return bound + return bound < 1e20 def _add_constraints_from_baseline_model(self, output_dict): for con in output_dict["constraints"].keys(): diff --git a/mphys/network/zmq_pbs.py b/mphys/network/zmq_pbs.py index 9a718a6b..c52b72bf 100644 --- a/mphys/network/zmq_pbs.py +++ b/mphys/network/zmq_pbs.py @@ -123,16 +123,19 @@ def start_server(self): self._initialize_connection() self.server_counter += 1 self._launch_job() + self.server_stopped = False def stop_server(self): - print( - f"CLIENT (subsystem {self.component_name}): Stopping the remote analysis server", - flush=True, - ) - if self.job.state == "R": - self.socket.send("shutdown|null".encode()) - self._shutdown_server() - self.socket.close() + if not self.server_stopped: + print( + f"CLIENT (subsystem {self.component_name}): Stopping the remote analysis server", + flush=True, + ) + if self.job.state == "R": + self.socket.send("shutdown|null".encode()) + self._shutdown_server() + self.socket.close() + self.server_stopped = True def enough_time_is_remaining(self, estimated_model_time): self.job.update_job_state() @@ -143,7 +146,7 @@ def enough_time_is_remaining(self, estimated_model_time): def job_has_expired(self): self.job.update_job_state() - if self.job.state == "R": + if self.job.state == "R" and not self.server_stopped: return False else: if self.job_expiration_max_restarts is not None: