Skip to content

Commit d87d2ea

Browse files
committed
Added support for multiple provDB clients using a shared engine attached to a remote server instance:
- The AnomalousSendManager instance in ADProvenanceDBclient is now unique to each client instance rather than shared.
- RPCs are no longer deregistered on client destruction; this is not necessary anyway.
- Fixed the AD simulator connection to the in-proc provDB server using an incorrect provider index.
- The AD simulator now supports connection to a remote provDB server instance.
- Added an example run script for ADsim example3 demonstrating use with a remote server.
1 parent d35405a commit d87d2ea

8 files changed

Lines changed: 182 additions & 14 deletions

File tree

include/chimbuko/ad/ADProvenanceDBclient.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ namespace chimbuko{
8484
sonata::Collection m_coll_normalexecs; /**< The normal executions collection */
8585
bool m_is_connected; /**< True if connection has been established to the provider */
8686

87-
static AnomalousSendManager anom_send_man; /**< Manager for outstanding anomalous requests */
87+
mutable AnomalousSendManager anom_send_man; /**< Manager for outstanding anomalous requests */
8888

8989
int m_rank; /**< MPI rank of current process */
9090
std::string m_server_addr; /**< Address of the server */

sim/include/sim/ad.hpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ namespace chimbuko_sim{
1414

1515
enum class CommType { Send, Recv };
1616

17+
struct provDBsetup{
18+
bool use_local; /**< Use a local server instance (default true)*/
19+
int remote_server_nshards; /**< Number of database shards on the remote server (default 1, only applicable for use_local = false)*/
20+
std::string remote_server_addr_dir; /**< Directory where remote server address files are written (only applicable for use_local = false)*/
21+
int remote_server_instances; /**< Number of remote server instances (only applicable for use_local = false)*/
22+
provDBsetup(): use_local(true), remote_server_nshards(1){}
23+
};
24+
1725
//An object that represents a rank of the AD
1826
class ADsim{
1927
std::unordered_map<unsigned long, std::list<ExecData_t> > m_all_execs; /**< Map of thread to execs */
@@ -46,7 +54,7 @@ namespace chimbuko_sim{
4654
* @param program_start Timestamp of program start
4755
* @param step_freq Frequency at which IO steps are to occur
4856
*/
49-
void init(int window_size, int pid, int rid, unsigned long program_start, unsigned long step_freq);
57+
void init(int window_size, int pid, int rid, unsigned long program_start, unsigned long step_freq, const provDBsetup &pdb_setup = provDBsetup());
5058

5159
/**
5260
* @brief Instantiate the AD simulator
@@ -56,8 +64,8 @@ namespace chimbuko_sim{
5664
* @param program_start Timestamp of program start
5765
* @param step_freq Frequency at which IO steps are to occur
5866
*/
59-
ADsim(int window_size, int pid, int rid, unsigned long program_start, unsigned long step_freq): ADsim(){
60-
init(window_size, pid, rid, program_start, step_freq);
67+
ADsim(int window_size, int pid, int rid, unsigned long program_start, unsigned long step_freq, const provDBsetup &pdb_setup = provDBsetup()): ADsim(){
68+
init(window_size, pid, rid, program_start, step_freq, pdb_setup);
6169
}
6270
ADsim(): m_outlier(nullptr), m_net_client(nullptr){}
6371

sim/include/sim/provdb.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ namespace chimbuko_sim{
3030
//Set before first ADsim is created
3131
inline int & nshards(){ static int n=1; return n; }
3232

33+
//Get the local provDBsim object
3334
inline provDBsim & getProvDB(){ static provDBsim pdb(nshards()); return pdb; }
34-
3535
};
3636
#endif

sim/main/example3.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,22 @@ int main(int argc, char **argv){
1414
MPI_Init(&argc, &argv);
1515
#endif
1616

17+
provDBsetup pdb_setup;
18+
int i=1;
19+
while(i<argc){
20+
std::string sarg(argv[i]);
21+
if(sarg == "-remote_provdb"){ //Allow for connection to an existing remote provDB server
22+
if(i+3 >= argc) fatal_error("Not enough arguments provided");
23+
pdb_setup.remote_server_addr_dir = argv[i+1];
24+
pdb_setup.remote_server_nshards = std::stoi(argv[i+2]);
25+
pdb_setup.remote_server_instances = std::stoi(argv[i+3]);
26+
pdb_setup.use_local = false;
27+
i+=4;
28+
}else{
29+
fatal_error(stringize("Unknown argument: %s",argv[i]));
30+
}
31+
}
32+
1733
int window_size = 5; //number of events to record around an anomaly in the provenance data
1834
int pid = 0; //program index
1935
unsigned long program_start = 100;
@@ -25,7 +41,7 @@ int main(int argc, char **argv){
2541

2642
std::vector<ADsim> ad;
2743
for(int r=0;r<n_ranks;r++)
28-
ad.push_back(ADsim(window_size, pid, r, program_start, step_freq));
44+
ad.push_back(ADsim(window_size, pid, r, program_start, step_freq, pdb_setup));
2945

3046
//Setup some functions
3147
registerFunc("main", 500, 50, 100);
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#Note: This configuration file is sourced into the bash environment for Chimbuko startup scripts, thus the user must follow correct shell conventions
2+
#Please do not remove any of the variables!
3+
4+
#IMPORTANT NOTE: Variables that cannot be left as default are marked as <------------ ***SET ME***
5+
6+
service_node_iface=eth0 #network interface upon which communication to the service node is performed <------------ ***SET ME***
7+
8+
####################################
9+
#Options for visualization module
10+
####################################
11+
use_viz=0 #enable or disable the visualization
12+
viz_root=/opt/chimbuko/viz #the root directory of the visualization module <------------ ***SET ME (if using viz)***
13+
viz_worker_port=6379 #the port on which to run the redis server for the visualization backend
14+
viz_port=5002 #the port on which to run the webserver
15+
export C_FORCE_ROOT=1 #required only for docker runs, allows celery to execute properly as root user <----------------- *** SET ME (if using Docker)
16+
17+
############################################################
18+
#General options for Chimbuko backend (pserver, ad, provdb)
19+
############################################################
20+
backend_root="infer" #The root install directory of the PerformanceAnalysis libraries. If set to "infer" it will be inferred from the path of the executables
21+
chimbuko_services="infer" #The location of the Chimbuko service script. If set to "infer" it will be inferred from backend_root
22+
23+
####################################
24+
#Options for the provenance database
25+
####################################
26+
use_provdb=1 #enable or disable the provDB. If disabled the provenance data will be written as JSON ASCII into the ${provdb_writedir} set below
27+
provdb_extra_args="" #any extra command line arguments to pass
28+
provdb_nshards=4 #number of database shards
29+
provdb_ninstances=1 #number of database server instances. Shards are distributed over instances
30+
provdb_engine="ofi+tcp;ofi_rxm" #the OFI libfabric provider used for the Mochi stack
31+
provdb_port=5000 #the port of the provenance database
32+
provdb_writedir=chimbuko/provdb #the directory in which the provenance database is written. Chimbuko creates chimbuko/provdb which can be used as a default
33+
provdb_commit_freq=10000 #frequency (in ms) at which the provenance database is committed to disk. If set to 0 it will commit only at the end
34+
35+
#With "verbs" provider (used for infiniband, iWarp, etc) we need to also specify the domain, which can be found by running fi_info (on a compute node)
36+
provdb_domain=mlx5_0 #only needed for verbs provider <------------ ***SET ME (if using verbs)***
37+
38+
export FI_UNIVERSE_SIZE=1600 # Defines the expected number of provenance DB clients per instance <------------- *** SET ME (should be larger than the number of clients/instance)
39+
export FI_MR_CACHE_MAX_COUNT=0 # disable MR cache in libfabric; still problematic as of libfabric 1.10.1
40+
export FI_OFI_RXM_USE_SRX=1 # use shared recv context in RXM; should improve scalability
41+
42+
####################################
43+
#Options for the parameter server
44+
####################################
45+
use_pserver=0 #enable or disable the pserver
46+
pserver_extra_args="" #any extra command line arguments to pass
47+
pserver_port=5559 #port for parameter server
48+
pserver_nt=2 #number of worker threads
49+
####################################
50+
#Options for the AD module
51+
####################################
52+
ad_extra_args="-perf_outputpath chimbuko/logs -perf_step 1" #any extra command line arguments to pass. Note: chimbuko/logs is automatically created by services script
53+
ad_win_size=5 #number of events around an anomaly to store; provDB entry size is proportional to this so keep it small!
54+
ad_alg="hbos" #the anomaly detection algorithm. Valid values are "hbos" and "sstd"
55+
ad_outlier_hbos_threshold=0.99 #the percentile of events outside of which are considered anomalies by the HBOS algorithm
56+
ad_outlier_sstd_sigma=12 #number of standard deviations that defines an outlier in the SSTD algorithm
57+
####################################
58+
#Options for TAU
59+
#Note: Only the TAU_ADIOS2_PATH, TAU_ADIOS2_FILE_PREFIX, EXE_NAME and TAU_ADIOS2_ENGINE variables are used by the Chimbuko services script and there only to generate the suggested
60+
# command to launch the AD (output to chimbuko/vars/chimbuko_ad_cmdline.var); they can be overridden by the run script if desired providing the appropriate modifications
61+
# are made to the AD launch command. The remainder of the variables are used only by TAU and can be freely overridden.
62+
####################################
63+
export TAU_ADIOS2_ENGINE=SST #online communication engine (alternative BP4 although this goes through the disk system and may be slower unless the BPfiles are stored on a burst disk)
64+
export TAU_ADIOS2_ONE_FILE=FALSE #a different connection file for each rank
65+
export TAU_ADIOS2_PERIODIC=1 #enable/disable ADIOS2 periodic output
66+
export TAU_ADIOS2_PERIOD=1000000 #period in us between ADIOS2 io steps
67+
export TAU_THREAD_PER_GPU_STREAM=1 #force GPU streams to appear as different TAU virtual threads
68+
export TAU_THROTTLE=0 #enable/disable throttling of short-running functions
69+
70+
export TAU_MAKEFILE=/opt/tau2/x86_64/lib/Makefile.tau-papi-mpi-pthread-pdt-adios2 #The TAU makefile to use <------------ ***SET ME***
71+
72+
#Note: the following 2 variables are not used by the service script but are included here for use from the user's run script allowing the application to be launched with either "${TAU_EXEC} <app>" or "${TAU_PYTHON} <app>"
73+
#Note: the "binding" -T ... is used by Tau to find the appropriate configuration. It can typically be inferred from the name of the Makefile. If using a non-MPI job the 'mpi' should be changed to 'serial' and a non-MPI build of
74+
# ADIOS2/TAU must exist
75+
#Suggestion: It is useful to test the command without Chimbuko first to ensure TAU picks up the correct binding; this can be done by 'export TAU_ADIOS2_ENGINE=BPFile' and then running the application with Tau but without Chimbuko.
76+
TAU_EXEC="tau_exec -T papi,mpi,pthread,pdt,adios2 -adios2_trace" #how to execute tau_exec; the -T arguments should mirror the makefile name <------------ ***SET ME***
77+
TAU_PYTHON="tau_python -T papi,mpi,pthread,pdt,adios2 -tau-python-interpreter=python3 -adios2_trace -tau-python-args=-u" #how to execute tau_python. Note that passing -u to python forces it to not buffer stdout so we can pipe it
78+
#to tee in realtime <--- SET ME (if !python3)
79+
80+
export EXE_NAME=main #the name of the executable (without path) <------------ ***SET ME***
81+
82+
TAU_ADIOS2_PATH=chimbuko/adios2 #path where the adios2 files are to be stored. Chimbuko services creates the directory chimbuko/adios2 in the working directory and this should be used by default
83+
TAU_ADIOS2_FILE_PREFIX=tau-metrics #the prefix of tau adios2 files; full filename is ${TAU_ADIOS2_FILE_PREFIX}-${EXE_NAME}-${RANK}.bp
84+
85+
86+
87+
88+
89+
90+
91+
92+
93+
94+
95+
96+
97+
###########################################################################
98+
# NON-USER VARIABLES BELOW = DON'T MODIFY THESE!!
99+
###########################################################################
100+
#Extra processing
101+
export TAU_ADIOS2_FILENAME="${TAU_ADIOS2_PATH}/${TAU_ADIOS2_FILE_PREFIX}"
102+
103+
if [[ ${backend_root} == "infer" ]]; then
104+
if [[ $(which provdb_admin) == "" ]]; then
105+
echo "When inferring the backend root directory, could not find provdb_admin in PATH. Please add your Chimbuko bin directory to PATH"
106+
exit 1
107+
fi
108+
109+
backend_root=$( readlink -f $(which provdb_admin | sed 's/provdb_admin//')/../ )
110+
fi
111+
112+
if [[ ${chimbuko_services} == "infer" ]]; then
113+
chimbuko_services="${backend_root}/scripts/launch/run_services.sh"
114+
if [ ! -f "${chimbuko_services}" ]; then
115+
echo "Could not infer service script location: service script does not exist at ${chimbuko_services}!"
116+
exit 1
117+
fi
118+
fi
119+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
2+
set -e
3+
4+
rm -rf chimbuko
5+
export CHIMBUKO_CONFIG=chimbuko_config.sh
6+
source ${CHIMBUKO_CONFIG}
7+
8+
if (( 1 )); then
9+
echo "Running services"
10+
${chimbuko_services} 2>&1 | tee services.log &
11+
echo "Waiting"
12+
while [ ! -f chimbuko/vars/chimbuko_ad_cmdline.var ]; do sleep 1; done
13+
ad_cmd=$(cat chimbuko/vars/chimbuko_ad_cmdline.var)
14+
fi
15+
16+
#Run the simulation
17+
if (( 1 )); then
18+
echo "Running sim"
19+
./example3 -remote_provdb chimbuko/provdb ${provdb_nshards} ${provdb_ninstances} 2>&1 | tee chimbuko/logs/sim.log
20+
fi
21+
22+
wait

sim/src/ad.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include<chimbuko/ad/ADNormalEventProvenance.hpp>
55
#include<chimbuko/ad/ADAnomalyProvenance.hpp>
66
#include<chimbuko/util/error.hpp>
7+
#include<chimbuko/provdb/setup.hpp>
78

89
#include<sim/ad.hpp>
910
#include<sim/provdb.hpp>
@@ -41,14 +42,21 @@ ADsim::ADsim(ADsim &&r):
4142
}
4243

4344

44-
void ADsim::init(int window_size, int pid, int rid, unsigned long program_start, unsigned long step_freq){
45+
void ADsim::init(int window_size, int pid, int rid, unsigned long program_start, unsigned long step_freq, const provDBsetup &pdb_setup){
4546
m_window_size = window_size;
4647
m_pid = pid;
4748
m_rid = rid;
4849
#ifdef ENABLE_PROVDB
4950
m_pdb_client.reset(new ADProvenanceDBclient(rid));
50-
m_pdb_client->setEnableHandshake(false);
51-
m_pdb_client->connectSingleServer(getProvDB().getAddr(), getProvDB().getNshards());
51+
52+
if(pdb_setup.use_local){ //use the in-process provDB server
53+
m_pdb_client->setEnableHandshake(false);
54+
int shard = rid % getProvDB().getNshards();
55+
std::string db_name = ProvDBsetup::getShardDBname(shard);
56+
m_pdb_client->connect(getProvDB().getAddr(), db_name, 0); //connect on provider 0 for in-proc server
57+
}else{
58+
m_pdb_client->connectMultiServer(pdb_setup.remote_server_addr_dir, pdb_setup.remote_server_nshards, pdb_setup.remote_server_instances);
59+
}
5260
#else
5361
m_prov_io.reset(new ADio(m_pid, m_rid));
5462
m_prov_io->setDispatcher();

src/ad/ADProvenanceDBclient.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@ AnomalousSendManager::~AnomalousSendManager(){
7070
verboseStream << "AnomalousSendManager exiting" << std::endl;
7171
}
7272

73-
74-
AnomalousSendManager ADProvenanceDBclient::anom_send_man;
75-
76-
7773
ADProvenanceDBclient::~ADProvenanceDBclient(){
7874
disconnect();
7975
verboseStream << "ADProvenanceDBclient exiting" << std::endl;
@@ -97,7 +93,6 @@ const sonata::Collection & ADProvenanceDBclient::getCollection(const ProvenanceD
9793
}
9894

9995
static void delete_rpc(thallium::remote_procedure* &rpc){
100-
rpc->deregister();
10196
delete rpc; rpc = nullptr;
10297
}
10398

0 commit comments

Comments
 (0)