Skip to content

Commit 1f868f6

Browse files
committed
Merge branch 'sm_release' into ckelly_develop
2 parents be8b0b2 + 59cd9b4 commit 1f868f6

31 files changed

Lines changed: 561 additions & 243 deletions

app/Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/3rdparty @PS_FLAGS@
2-
LDADD = ../src/libchimbuko.la -lstdc++fs
2+
LDADD = ../src/libchimbuko.la -lstdc++fs
33

44
bin_PROGRAMS = driver pclient pclient_stats hpserver pserver pshutdown sstSinker sst_view bpfile_replay
55

app/driver.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ optionalArgsParser & getOptionalArgsParser(){
7171
addOptionalCommandLineArg(p, hbos_use_global_threshold, "Set true to use a global threshold in HBOS algorithm. Dafault is true.");
7272
addOptionalCommandLineArg(p, program_idx, "Set the index associated with the instrumented program. Use to label components of a workflow. (default 0)");
7373
addOptionalCommandLineArg(p, outlier_sigma, "Set the number of standard deviations that defines an anomalous event (default 6)");
74+
addOptionalCommandLineArg(p, net_recv_timeout, "Timeout (in ms) for blocking receives on client from parameter server");
7475
addOptionalCommandLineArg(p, pserver_addr, "Set the address of the parameter server. If empty (default) the pserver will not be used.");
7576
addOptionalCommandLineArg(p, hpserver_nthr, "Set the number of threads used by the hierarchical PS. This parameter is used to compute a port offset for the particular endpoint that this AD rank connects to (default 1)");
7677
addOptionalCommandLineArg(p, interval_msec, "Force the AD to pause for this number of ms at the end of each IO step (default 0)");
@@ -129,6 +130,7 @@ ChimbukoParams getParamsFromCommandLine(int argc, char** argv, const int mpi_wor
129130
params.ad_algorithm = "hbos";
130131
params.hbos_threshold = 0.99;
131132
params.hbos_use_global_threshold = true;
133+
params.net_recv_timeout = 30000;
132134
params.pserver_addr = ""; //don't use pserver by default
133135
params.hpserver_nthr = 1;
134136
params.outlier_sigma = 6.0; // anomaly detection algorithm parameter

app/provdb_admin.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ int main(int argc, char** argv) {
147147
progressStream << "ProvDB Admin: Enabling verbose debug output" << std::endl;
148148
enableVerboseLogging() = true;
149149
spdlog::set_level(spdlog::level::trace); //enable logging of Sonata
150-
}
150+
}
151151

152152
//argv[1] should specify the ip address and port (the only way to fix the port that I'm aware of)
153153
//Should be of form <ip address>:<port> eg. 127.0.0.1:1234
@@ -239,7 +239,7 @@ int main(int argc, char** argv) {
239239

240240
//Initialize provider
241241
sonata::Provider provider(engine, 0);
242-
242+
243243
progressStream << "ProvDB Admin: Provider is running on " << addr << std::endl;
244244

245245
{ //Scope in which admin object is active
@@ -269,7 +269,7 @@ int main(int argc, char** argv) {
269269
//Create the collections
270270
{ //scope in which client is active
271271
sonata::Client client(engine);
272-
272+
273273
//Initialize the provdb shards
274274
std::vector<sonata::Database> db(args.nshards);
275275
for(int s=0;s<args.nshards;s++){
@@ -297,7 +297,7 @@ int main(int argc, char** argv) {
297297
Clock::time_point commit_timer_start = Clock::now();
298298

299299
//Spin quietly until SIGTERM sent
300-
signal(SIGTERM, termSignalHandler);
300+
signal(SIGTERM, termSignalHandler);
301301
progressStream << "ProvDB Admin: main thread waiting for completion" << std::endl;
302302
while(!stop_wait_loop) { //stop wait loop will be set by SIGTERM handler
303303
tl::thread::sleep(engine, 1000); //Thallium engine sleeps but listens for rpc requests
@@ -313,13 +313,13 @@ int main(int argc, char** argv) {
313313
glob_db.commit();
314314
commit_timer_start = Clock::now();
315315
}
316-
316+
317317
//If at least one client has previously connected but none are now connected, shutdown the server
318318
//If all clients disconnected we must also wait for the pserver to disconnect (if it is connected)
319319

320320
//If args.autoshutdown is disabled we can force shutdown via a "stop_server" RPC
321321
if(
322-
(args.autoshutdown || cmd_shutdown) &&
322+
(args.autoshutdown || cmd_shutdown) &&
323323
( a_client_has_connected && connected.size() == 0 ) &&
324324
( !pserver_has_connected || (pserver_has_connected && !pserver_connected) ) &&
325325
( !committer_has_connected || (committer_has_connected && !committer_connected) )
@@ -336,7 +336,7 @@ int main(int argc, char** argv) {
336336
progressStream << "ProvDB Admin: destroying pserver database as it didn't connect (connection is optional)" << std::endl;
337337
admin.destroyDatabase(addr, 0, glob_db_name);
338338
}
339-
339+
340340
progressStream << "ProvDB Admin: ending admin scope" << std::endl;
341341
margo_dump("margo_dump_end_admin_scope");
342342
}//admin scope
@@ -346,10 +346,10 @@ int main(int argc, char** argv) {
346346
}//provider scope
347347

348348
progressStream << "ProvDB Admin: shutting down server engine" << std::endl;
349-
delete mtx; //delete mutex prior to engine finalize
349+
delete mtx; //delete mutex prior to engine finalize
350350
engine.finalize();
351-
progressStream << "ProvDB Admin: finished, exiting engine scope" << std::endl;
351+
progressStream << "ProvDB Admin: finished, exiting engine scope" << std::endl;
352352
}
353-
progressStream << "ProvDB Admin: finished, exiting main scope" << std::endl;
353+
progressStream << "ProvDB Admin: finished, exiting main scope" << std::endl;
354354
return 0;
355355
}

app/sstSinker.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ int main(int argc, char ** argv){
2020

2121
int world_rank, world_size;
2222
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
23-
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
23+
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
2424

2525
bool error = false;
26-
try
26+
try
2727
{
2828
commandLineParser<SinkerArgs> cmdline;
2929
addOptionalCommandLineArg(cmdline, timeout, "Specify the SST connect timeout in seconds (Default 60s)");
@@ -35,19 +35,19 @@ int main(int argc, char ** argv){
3535
<< "Options:" << std::endl;
3636
cmdline.help(std::cout);
3737
return 0;
38-
}
38+
}
3939

4040
std::string engineType = argv[1]; // BPFile or SST
4141
std::string data_dir = argv[2]; // *.bp location
4242
std::string prefix = argv[3]; // "tau-metrics-nwchem"
4343
std::string inputFile = prefix + "-" + std::to_string(world_rank) + ".bp";
4444
int fetch_data = atoi(argv[4]);
45-
45+
4646
SinkerArgs args;
4747
cmdline.parse(args, argc-5, (const char**)(argv+5) );
4848

4949
if (world_rank == 0) {
50-
std::cout << "\n"
50+
std::cout << "\n"
5151
<< "rank : " << world_rank << "\n"
5252
<< "Engine : " << engineType << "\n"
5353
<< "BP in dir : " << data_dir << "\n"
@@ -62,7 +62,7 @@ int main(int argc, char ** argv){
6262
// -----------------------------------------------------------------------
6363
ADParser * parser;
6464

65-
// int step = 0;
65+
// int step = 0;
6666

6767
// -----------------------------------------------------------------------
6868
// Measurement variables
@@ -92,7 +92,7 @@ int main(int argc, char ** argv){
9292
if (!parser->getStatus())
9393
{
9494
// No more steps available.
95-
break;
95+
break;
9696
}
9797

9898
// step = parser->getCurrentStep();
@@ -128,7 +128,7 @@ int main(int argc, char ** argv){
128128
total_processing_time = global_measures[0];
129129
total_frames = global_measures[1];
130130
}
131-
131+
132132
if (false && world_rank == 0) {
133133
std::cout << "\n"
134134
<< "Avg. num. frames : " << (double)total_frames/(double)world_size << "\n"

docker/ubuntu18.04/openmpi4.0.4/Dockerfile.mochi

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,3 @@ WORKDIR /
2222

2323
ENTRYPOINT [ "/bin/bash" ]
2424
#To setup spack you need to run "source /spack/spack/share/spack/setup-env.sh" inside your image
25-

include/chimbuko/ad/ADLocalCounterStatistics.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ namespace chimbuko{
8383
*
8484
* The message string is the output of net_serialize()
8585
*/
86-
std::pair<size_t, size_t> updateGlobalStatistics(ADNetClient &net_client) const;
86+
std::pair<size_t, size_t> updateGlobalStatistics(ADThreadNetClient &net_client) const;
8787

8888
/**
8989
* @brief Attach a PerfStats object into which performance metrics are accumulated
@@ -149,7 +149,7 @@ namespace chimbuko{
149149
* @param step step (or frame) number
150150
* @return std::pair<size_t, size_t> [sent, recv] message size
151151
*/
152-
static std::pair<size_t, size_t> updateGlobalStatistics(ADNetClient &net_client, const std::string &l_stats, int step);
152+
static std::pair<size_t, size_t> updateGlobalStatistics(ADThreadNetClient &net_client, const std::string &l_stats, int step);
153153

154154
unsigned long m_program_idx; /**< Program idx*/
155155
int m_step; /**< io step */

include/chimbuko/ad/ADLocalFuncStatistics.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ namespace chimbuko{
134134
*
135135
* The message communicated is the string dump of the output of get_json_state()
136136
*/
137-
std::pair<size_t, size_t> updateGlobalStatistics(ADNetClient &net_client) const;
137+
std::pair<size_t, size_t> updateGlobalStatistics(ADThreadNetClient &net_client) const;
138138

139139

140140
/**
@@ -206,7 +206,7 @@ namespace chimbuko{
206206
* @param step step (or frame) number
207207
* @return std::pair<size_t, size_t> [sent, recv] message size
208208
*/
209-
static std::pair<size_t, size_t> updateGlobalStatistics(ADNetClient &net_client, const std::string &l_stats, int step);
209+
static std::pair<size_t, size_t> updateGlobalStatistics(ADThreadNetClient &net_client, const std::string &l_stats, int step);
210210

211211
AnomalyData m_anom_data; /**< AnomalyData instance holding information about the anomalies */
212212
std::unordered_map<unsigned long, FuncStats> m_funcstats; /**< map of function index to function profile statistics */

0 commit comments

Comments
 (0)