@@ -186,10 +186,15 @@ class SlurmScriptTemplate(TypedDict):
186186 ],
187187 "write_to_json": [
188188 '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"',
189- 'jq --arg server_addr "$server_address" \\',
190- " '. + {{\"server_address\": $server_addr}}' \\",
191- ' "$json_path" > temp.json \\',
192- ' && mv temp.json "$json_path"',
189+ 'tmp_json="${{json_path}}.tmp.$$"',
190+ "for _attempt in 1 2 3 4 5; do",
191+ ' jq --arg server_addr "$server_address" \\',
192+ " '. + {{\"server_address\": $server_addr}}' \\",
193+ ' "$json_path" > "$tmp_json" \\',
194+ ' && mv "$tmp_json" "$json_path" \\',
195+ " && break",
196+ " sleep 2",
197+ "done",
193198 ],
194199 "launch_cmd": {
195200 "vllm": [
@@ -303,10 +308,15 @@ class BatchModelLaunchScriptTemplate(TypedDict):
303308 "write_to_json": [
304309 "het_job_id=$(($SLURM_JOB_ID+{het_group_id}))",
305310 'json_path="{log_dir}/{slurm_job_name}.$het_job_id/{model_name}.$het_job_id.json"',
306- 'jq --arg server_addr "$server_address" \\',
307- " '. + {{\"server_address\": $server_addr}}' \\",
308- ' "$json_path" > temp_{model_name}.json \\',
309- ' && mv temp_{model_name}.json "$json_path"\n',
311+ 'tmp_json="${{json_path}}.tmp.$$"',
312+ "for _attempt in 1 2 3 4 5; do",
313+ ' jq --arg server_addr "$server_address" \\',
314+ " '. + {{\"server_address\": $server_addr}}' \\",
315+ ' "$json_path" > "$tmp_json" \\',
316+ ' && mv "$tmp_json" "$json_path" \\',
317+ " && break",
318+ " sleep 2",
319+ "done\n",
310320 ],
311321 "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {{image_path}} \\",
312322 "launch_cmd": {
0 commit comments