Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ def setup(self):
# NOTE: make sure setup isn't called multiple times, otherwise the first jobs/port forwarding will go unused and you'll have to stop them manually
for i in range(self.options["num_scenarios"]):

pbs_launcher = PBS.k4(time=1)
pbs_launcher = PBS.k4(
profile_filename="~/.bashrc", requested_number_of_nodes=1, time=1
)
pbs_launcher.mpiexec = "mpirun"
pbs_launcher.requested_number_of_nodes = 1

Expand Down
21 changes: 18 additions & 3 deletions examples/aerostructural/supersonic_panel/as_opt_remote_serial.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,25 @@ def run_optimization(prob: om.Problem):

def main():
check_totals = False
hpc = "k" # nas or k

pbs = PBS.k4(time=1)
pbs.mpiexec = "mpirun"
pbs.requested_number_of_nodes = 1
if hpc == "nas":

pbs = PBS.nas(
profile_filename="~/.bashrc",
# group_list=None, # add group list here
proc_type="rom",
requested_number_of_nodes=1,
time=1,
)

elif hpc == "k":

pbs = PBS.k4(
profile_filename="~/.bashrc",
requested_number_of_nodes=1,
time=1,
)

prob = om.Problem()
prob.model.add_subsystem(
Expand Down
23 changes: 21 additions & 2 deletions mphys/network/remote_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,20 @@ def initialize(self):
types=bool,
desc="Skip the objective/constraint definition. The quantities will still be added to outputs",
)
self.options.declare(
"stop_server_for_down_time",
default=0,
types=int,
desc="Stop server after evaluation, in case significant down time is expected afterwards. Allows user to conserve HPC "
+ "SBUs in certain applications. 0=never, 1=after first function call, 2=after first derivative call.",
)

@switch_run_directory
def setup(self):
self.var_naming_dot_replacement = self.options["var_naming_dot_replacement"]
self.use_derivative_coloring = self.options["use_derivative_coloring"]
self.derivative_coloring_num = 0
self.stop_server_for_down_time = self.options["stop_server_for_down_time"]

output_dict = None
if self.comm.rank == 0:
Expand Down Expand Up @@ -232,6 +240,17 @@ def evaluate_model(self, remote_input_dict=None, command="initialize"):
else:
self.times_function = np.hstack([self.times_function, model_time_elapsed])

if self.stop_server_for_down_time > 0:
if self.stop_server_for_down_time == 1 or (
self.stop_server_for_down_time == 2
and self._doing_derivative_evaluation(command)
):
if self.comm.rank == 0:
print(
f"CLIENT (subsystem {self.name}): Stopping server's HPC job for down time"
)
self.server_manager.stop_server()

return remote_output_dict

def _assign_objective_partials_from_remote_output(self, remote_dict, partials):
Expand Down Expand Up @@ -450,13 +469,13 @@ def _lower_bound_used(self, bound):
if hasattr(bound, "__len__"):
return (np.array(bound) > -1e20).any()
else:
return bound
return bound > -1e20

def _upper_bound_used(self, bound):
if hasattr(bound, "__len__"):
return (np.array(bound) < 1e20).any()
else:
return bound
return bound < 1e20

def _add_constraints_from_baseline_model(self, output_dict):
for con in output_dict["constraints"].keys():
Expand Down
21 changes: 12 additions & 9 deletions mphys/network/zmq_pbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,16 +123,19 @@ def start_server(self):
self._initialize_connection()
self.server_counter += 1
self._launch_job()
self.server_stopped = False

def stop_server(self):
print(
f"CLIENT (subsystem {self.component_name}): Stopping the remote analysis server",
flush=True,
)
if self.job.state == "R":
self.socket.send("shutdown|null".encode())
self._shutdown_server()
self.socket.close()
if not self.server_stopped:
print(
f"CLIENT (subsystem {self.component_name}): Stopping the remote analysis server",
flush=True,
)
if self.job.state == "R":
self.socket.send("shutdown|null".encode())
self._shutdown_server()
self.socket.close()
self.server_stopped = True

def enough_time_is_remaining(self, estimated_model_time):
self.job.update_job_state()
Expand All @@ -143,7 +146,7 @@ def enough_time_is_remaining(self, estimated_model_time):

def job_has_expired(self):
self.job.update_job_state()
if self.job.state == "R":
if self.job.state == "R" and not self.server_stopped:
return False
else:
if self.job_expiration_max_restarts is not None:
Expand Down
Loading