Skip to content

Commit 9d718e8

Browse files
committed
Added example scripts for launching multiple provDB instances on different Summit nodes
1 parent b2cbb98 commit 9d718e8

5 files changed

Lines changed: 485 additions & 0 deletions

File tree

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#Note: This configuration file is sourced into the bash environment for Chimbuko startup scripts, thus the user must follow correct shell conventions
2+
#Please do not remove any of the variables!
3+
4+
export HG_LOG_LEVEL=debug
5+
export HG_NA_LOG_LEVEL=error
6+
export FI_UNIVERSE_SIZE=5000
7+
#export FI_LOG_LEVEL=Trace
8+
#export SstVerbose=5
9+
#export MARGO_ENABLE_PROFILING=1
10+
#export MARGO_ENABLE_DIAGNOSTICS=1
11+
#export ABT_PRINT_RAW_STACK=false
12+
#export TAU_VERBOSE=1
13+
#export CHIMBUKO_VERBOSE=1
14+
15+
# disable MR cache in libfabric; still problematic as of libfabric 1.10.1
16+
export FI_MR_CACHE_MAX_COUNT=0
17+
# use shared recv context in RXM; should improve scalability
18+
export FI_OFI_RXM_USE_SRX=1
19+
20+
21+
#IMPORTANT NOTE: Variables that cannot be left as default are marked as <------------ ***SET ME***
22+
23+
service_node_iface=ib0 #network interface upon which communication to the service node is performed <------------ ***SET ME***
24+
25+
####################################
26+
#Options for visualization module
27+
####################################
28+
use_viz=0 #enable or disable the visualization
29+
viz_root=/gpfs/alpine/csc299/proj-shared/ckelly/chimbuko_3_4_21/spack/spack/opt/spack/linux-rhel7-power9le/gcc-9.1.0/chimbuko-visualization2-master-lmzqciw7aluuzkyghi7gmhjhg2qy6bt5
30+
viz_worker_port=6379 #the port on which to run the redis server for the visualization backend
31+
viz_port=5002 #the port on which to run the webserver
32+
#export C_FORCE_ROOT=1 #required only for docker runs, allows celery to execute properly as root user <----------------- *** SET ME (if using Docker)
33+
34+
############################################################
35+
#General options for Chimbuko backend (pserver, ad, provdb)
36+
############################################################
37+
backend_root="infer" #The root install directory of the PerformanceAnalysis libraries. If set to "infer" it will be inferred from the path of the executables
38+
chimbuko_services="infer" #The location of the Chimbuko service script. If set to "infer" it will be inferred from backend_root
39+
40+
####################################
41+
#Options for the provenance database
42+
####################################
43+
use_provdb=1 #enable or disable the provDB. If disabled the provenance data will be written as JSON ASCII into the ${provdb_writedir} set below
44+
provdb_extra_args="" #-db_type unqlite -db_commit_freq 1000000
45+
provdb_nshards=40 #number of database shards
46+
provdb_ninstances=2
47+
#provdb_engine="ofi+tcp;ofi_rxm" #the OFI libfabric provider used for the Mochi stack
48+
provdb_engine="verbs" #the OFI libfabric provider used for the Mochi stack
49+
provdb_port=5000 #the port of the provenance database
50+
provdb_writedir=/mnt/bb/ckelly
51+
provdb_commit_freq=30000 #how often (in ms) the provenance database is committed to disk
52+
53+
#With "verbs" provider (used for infiniband, iWarp, etc) we need to also specify the domain, which can be found by running fi_info (on a compute node)
54+
provdb_domain=mlx5_0 #only needed for verbs provider <------------ ***SET ME (if using verbs)***
55+
56+
57+
####################################
58+
#Options for the parameter server
59+
###################################
60+
use_pserver=0 #enable or disable the pserver
61+
pserver_extra_args="-stat_outputdir chimbuko/viz" #any extra command line arguments to pass
62+
pserver_port=5559 #port for parameter server
63+
pserver_nt=22 #number of worker threads
64+
65+
####################################
66+
#Options for the AD module
67+
####################################
68+
ad_extra_args="-perf_outputpath chimbuko/logs -perf_step 1 -parser_beginstep_timeout 90" #any extra command line arguments to pass. chimbuko/logs is automatically created by services script
69+
ad_win_size=5 #number of events around an anomaly to store; provDB entry size is proportional to this
70+
ad_alg="sstd"
71+
ad_outlier_sstd_sigma=12 #number of standard deviations that defines an outlier
72+
ad_outlier_hbos_threshold=0.99 #the percentile of events outside of which are considered anomalies by the HBOS algorithm
73+
74+
####################################
75+
#Options for TAU
76+
#Note: Only the TAU_ADIOS2_PATH, TAU_ADIOS2_FILE_PREFIX, EXE_NAME and TAU_ADIOS2_ENGINE variables are used by the Chimbuko services script and there only to generate the suggested
77+
# command to launch the AD (output to chimbuko/vars/chimbuko_ad_cmdline.var); they can be overridden by the run script if desired providing the appropriate modifications
78+
# are made to the AD launch command. The remainder of the variables are used only by TAU and can be freely overridden.
79+
####################################
80+
export TAU_ADIOS2_ENGINE=BP4 #ADIOS2 engine: SST is the online communication engine; the alternative, BP4, goes through the disk system and may be slower unless the BP files are stored on a burst disk
81+
export TAU_ADIOS2_ONE_FILE=FALSE #a different connection file for each rank
82+
export TAU_ADIOS2_PERIODIC=1 #enable/disable ADIOS2 periodic output
83+
export TAU_ADIOS2_PERIOD=1000000 #period in us between ADIOS2 io steps
84+
export TAU_THREAD_PER_GPU_STREAM=1 #force GPU streams to appear as different TAU virtual threads
85+
export TAU_THROTTLE=1 #enable/disable throttling of short-running functions
86+
87+
export TAU_MAKEFILE=/gpfs/alpine/csc299/proj-shared/ckelly/chimbuko_3_4_21/spack/spack/opt/spack/linux-rhel7-power9le/gcc-9.1.0/tau-git_commit_6_1_21-fvx347belp7prd5csk7s7tuk4dvold7m/lib/Makefile.tau-papi-gnu-mpi-pthread-python-cupti-pdt-adios2
88+
89+
#export TAU_ADIOS2_SELECTION_FILE=/gpfs/alpine/csc299/scratch/ckelly/NWChem/scaling/filter.tau
90+
91+
#Note: the following 2 variables are not used by the service script but are included here for use from the user's run script allowing the application to be launched with either "${TAU_EXEC} <app>" or "${TAU_PYTHON} <app>"
92+
#Note: the "binding" -T ... is used by Tau to find the appropriate configuration. It can typically be inferred from the name of the Makefile. If using a non-MPI job the 'mpi' should be changed to 'serial' and a non-MPI build of
93+
# ADIOS2/TAU must exist
94+
#Suggestion: It is useful to test the command without Chimbuko first to ensure TAU picks up the correct binding; this can be done by 'export TAU_ADIOS2_ENGINE=BPFile' and then running the application with Tau but without Chimbuko.
95+
TAU_EXEC="tau_exec -T papi,mpi,pthread,pdt,adios2 -adios2_trace" #how to execute tau_exec; the -T arguments should mirror the makefile name <------------ ***SET ME***
96+
TAU_PYTHON="tau_python -T papi,mpi,pthread,pdt,adios2 -tau-python-interpreter=python3 -adios2_trace -tau-python-args=-u" #how to execute tau_python. Note that passing -u to python forces it to not buffer stdout so we can pipe it
97+
#to tee in realtime <--- SET ME (if !python3)
98+
99+
export EXE_NAME=nwchem #the name of the executable (without path) <------------ ***SET ME***
100+
101+
TAU_ADIOS2_PATH=/mnt/bb/ckelly #path where the adios2 files are to be stored. Chimbuko services creates the directory chimbuko/adios2 in the working directory and this should be used by default
102+
#TAU_ADIOS2_PATH=chimbuko/adios2 #/mnt/bb/${USER} #path where the adios2 files are to be stored. Chimbuko services creates the directory chimbuko/adios2 in the working directory and this should be used by default
103+
TAU_ADIOS2_FILE_PREFIX=tau-metrics #the prefix of tau adios2 files; full filename is ${TAU_ADIOS2_FILE_PREFIX}-${EXE_NAME}-${RANK}.bp
104+
105+
106+
107+
108+
109+
110+
111+
112+
113+
114+
115+
###########################################################################
116+
# NON-USER VARIABLES BELOW = DON'T MODIFY THESE!!
117+
###########################################################################
118+
#Extra processing
119+
# Full path prefix of the TAU ADIOS2 output files, built from the user
# settings above.
export TAU_ADIOS2_FILENAME="${TAU_ADIOS2_PATH}/${TAU_ADIOS2_FILE_PREFIX}"

# Print the inferred backend install root: the parent of the directory that
# holds the provdb_admin executable found in PATH.
# Outputs: the canonicalised root directory on stdout.
# Returns: non-zero (printing nothing) if provdb_admin is not in PATH.
_chimbuko_infer_backend_root() {
    local admin_exe
    admin_exe=$(type -P provdb_admin) || return 1
    # dirname removes only the final path component. The previous sed-based
    # stripping removed the FIRST "provdb_admin" substring, which corrupts the
    # path when an install directory name itself contains that string.
    readlink -f -- "$(dirname -- "${admin_exe}")/.."
}

# NOTE: this file is sourced, so the 'exit' calls below terminate the
# sourcing script.
if [[ "${backend_root}" == "infer" ]]; then
    # Fail loudly instead of silently resolving to "/" when provdb_admin is
    # missing from PATH.
    if ! backend_root=$(_chimbuko_infer_backend_root); then
        echo "Could not infer backend_root: provdb_admin executable not found in PATH!" >&2
        exit 1
    fi
fi

if [[ "${chimbuko_services}" == "infer" ]]; then
    chimbuko_services="${backend_root}/scripts/launch/run_services.sh"
    if [[ ! -f "${chimbuko_services}" ]]; then
        echo "Could not infer service script location: service script does not exist at ${chimbuko_services}!" >&2
        exit 1
    fi
fi
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
# Pre-flight check run before the Chimbuko services start.
# NOTE(review): presumably this is the path_check.sh that the run script
# copies into the work directory -- confirm.
#
# Verifies that the TAU ADIOS2 output path is writable and, when the provDB
# is enabled, removes old database shard / address files and records the
# resolved database directory in provdb_writedir.log.
#
# Requires: CHIMBUKO_CONFIG set to the path of the Chimbuko config file.

: "${CHIMBUKO_CONFIG:?CHIMBUKO_CONFIG must be set to the Chimbuko configuration file}"
source "${CHIMBUKO_CONFIG}"

#Check tau adios2 path is writable; by default this is ${bp_dir} but it can be overridden by users, eg for offline analysis
# Test the command directly so that ANY non-zero status is treated as a
# failure, not only an exit status of exactly 1.
if ! touch "${TAU_ADIOS2_PATH}/write_check"; then
    echo "Chimbuko Services: Could not write to ADIOS2 output path ${TAU_ADIOS2_PATH}, check permissions"
    exit 1
fi
rm -f "${TAU_ADIOS2_PATH}/write_check"

# Bare name inside (( )) evaluates to 0 when the variable is unset, instead
# of producing an arithmetic syntax error.
if (( use_provdb == 1 )); then
    # Resolve into a separate variable so a failed readlink cannot leave
    # provdb_writedir empty and root the rm glob below at "/".
    if ! resolved_writedir=$(readlink -f -- "${provdb_writedir}"); then
        echo "Chimbuko Services: Could not resolve provDB write directory '${provdb_writedir}'" >&2
        exit 1
    fi
    provdb_writedir=${resolved_writedir}
    # Shard files live in the write directory; provider.address* files are
    # removed from the current working directory.
    rm -f -- "${provdb_writedir:?}"/provdb.*.unqlite* provider.address*
    echo "${provdb_writedir}" > provdb_writedir.log
fi
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/bin/bash
### Begin BSUB Options
#BSUB -P CSC299
#BSUB -J provdb-benchmark
#BSUB -W 00:30
#BSUB -nnodes 97
#BSUB -N
#BSUB -alloc_flags "smt4 pmcd nvme"
### End BSUB Options and begin shell commands

# LSF batch script that benchmarks the provenance database: it launches the
# Chimbuko services, waits for them to publish the AD command-line file, then
# runs the benchmark client via jsrun and copies the results back to the
# submission directory.
# NOTE(review): presumably this is run_main.sh, which is copied into the
# output directory at the end -- confirm.
#
# The allocation flags are given as a single quoted list rather than as two
# separate "#BSUB -alloc_flags" lines, so that all three are requested.

ulimit -c unlimited
set -e
set -o pipefail

BASE=$(pwd)

n_nodes_total=97 #First will be used for services; keep in sync with "#BSUB -nnodes" above
n_nodes=$(( n_nodes_total - 1 ))

n_mpi_ranks_per_node=42
ncores_per_host_ad=42

#n_mpi_ranks_per_node=20
#ncores_per_host_ad=20


export CHIMBUKO_CONFIG="${BASE}/chimbuko_config.sh"
source /autofs/nccs-svm1_home1/ckelly/setup_chimbuko_noAD.sh
source "${CHIMBUKO_CONFIG}"

SERVICES="${chimbuko_services}"

echo "Job starting"
date
echo "Driver is : " "$(which driver)"

# Fresh per-job working directory on scratch
WORKDIR="/gpfs/alpine/csc299/scratch/ckelly/tmp_benchmark_provdb_run.${LSB_JOBID}"
rm -rf -- "${WORKDIR}"
mkdir -p -- "${WORKDIR}"
cd -- "${WORKDIR}"

# NOTE(review): presumably this generates the main.urs resource file passed
# to jsrun below -- confirm against gen_urs.pl
"${BASE}/gen_urs.pl" "${n_nodes_total}" "${n_mpi_ranks_per_node}" 1 0 "${provdb_ninstances}"

cp "${BASE}"/run_services*.sh .
cp "${BASE}/path_check.sh" .

#Generate ERF file for head node and AD/main
#rm -f services.erf ad.erf main.erf
#${BASE}/gen_erf.pl ${n_nodes_total} ${n_mpi_ranks_per_node} 0 0 ${ncores_per_host_ad};

# File whose appearance signals that the services have started. A single
# variable is used so that the stale-file cleanup and the wait loop refer to
# the same file (the cleanup previously removed "chimbuko_ad_cmd.var").
ad_cmdline_var=chimbuko/vars/chimbuko_ad_cmdline.var

if (( 1 )); then   # manual toggle: set to 0 to skip launching the services
    echo "Launching services"

    #Run the services
    rm -f "${ad_cmdline_var}"
    ./run_services.sh &

    #Wait for the services to start and generate their outputs
    while [[ ! -f "${ad_cmdline_var}" ]]; do sleep 1; done
fi

# Benchmark client parameters
provdb_addr_dir=chimbuko/provdb
exe=/autofs/nccs-svm1_home1/ckelly/bld/AD/benchmark_suite/benchmark_provdb/benchmark_client
cycles=500
callstack_size=2
ncounters=5
winsize=5
comm_messages_per_winevent=1
anomalies_per_cycle=1
normal_events_per_cycle=1
cycle_time_ms=1000
nshards=${provdb_nshards}
ninstances=${provdb_ninstances}
perf_write_freq=5 #cycles
perf_dir=chimbuko/logs
do_state_dump=0
# -max_outstanding_sends 50

# The command is kept as a string because it embeds a redirection and a pipe
# ("2>&1 | tee ..."), which only take effect when the string is eval'd below.
client_cmd="$exe ${provdb_addr_dir} -cycles ${cycles} -callstack_size ${callstack_size} -ncounters ${ncounters} -winsize ${winsize} -comm_messages_per_winevent ${comm_messages_per_winevent} -anomalies_per_cycle ${anomalies_per_cycle} -normal_events_per_cycle ${normal_events_per_cycle} -cycle_time_ms ${cycle_time_ms} -nshards ${nshards} -ninstances ${ninstances} -perf_write_freq ${perf_write_freq} -perf_dir ${perf_dir} -do_state_dump ${do_state_dump} 2>&1 | tee chimbuko/logs/client.log"

if (( 1 )); then   # manual toggle: set to 0 to skip launching the client
    echo "Instantiating client"
    echo "Command is " "${client_cmd}"
    #eval "jsrun --erf_input=ad.erf -e prepended ${client_cmd} &"
    eval "jsrun -U main.urs -e prepended ${client_cmd} &"
fi

echo "Waiting for job completion"
jswait all

echo "Copying back chimbuko dir (minus the actual provdb)"
rm -rf chimbuko/provdb/*.unqlite
outputdir="${BASE}/chimbuko.${LSB_JOBID}"
mv chimbuko "${outputdir}"
cp "${BASE}/run_main.sh" "${BASE}/chimbuko_config.sh" "${outputdir}"

echo "Done"
date

0 commit comments

Comments
 (0)