Skip to content

Commit 7dae3a7

Browse files
committed
Added unit tests for ADAnomalyProvenance::getProvenanceEntries
To ADAnomalyProvenance, added the ability to set a minimum runtime for anomalies recorded to the provenance database, plus a unit test. Integrated the above into the main Chimbuko class. Added command-line options to driver and driver_multirank to set the minimum anomaly runtime for provenance output.
1 parent 3536b18 commit 7dae3a7

7 files changed

Lines changed: 126 additions & 7 deletions

File tree

app/driver.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ optionalArgsParser & getOptionalArgsParser(){
113113
addOptionalCommandLineArg(p, step_report_freq, "Set the steps between Chimbuko reporting IO step progress. Use 0 to deactivate this logging entirely (default 1)");
114114
addOptionalCommandLineArg(p, prov_record_startstep, "If != -1, the IO step on which to start recording provenance information for anomalies (for testing, default -1)");
115115
addOptionalCommandLineArg(p, prov_record_stopstep, "If != -1, the IO step on which to stop recording provenance information for anomalies (for testing, default -1)");
116+
addOptionalCommandLineArg(p, prov_min_anom_time, "Set the minimum exclusive runtime (in microseconds) for anomalies to recorded in the provenance output (default 0)");
117+
116118
addOptionalCommandLineArg(p, analysis_step_freq, "Set the frequency in IO steps between analyzing the data. Data will be accumulated over intermediate steps. (default 1)");
117119
addOptionalCommandLineArg(p, monitoring_watchlist_file, "Provide a filename containing the counter watchlist for the integration with the monitoring plugin. Empty string (default) uses the default subset. File format is JSON: \"[ [<COUNTER NAME>, <FIELD NAME>], ... ]\" where COUNTER NAME is the name of the counter in the input data stream and FIELD NAME the name of the counter in the provenance output.");
118120
addOptionalCommandLineArg(p, monitoring_counter_prefix, "Provide an optional prefix marking a set of monitoring plugin counters to be captured, on top of or superseding the watchlist. Empty string (default) is ignored.");

app/driver_multirank.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ optionalArgsParser & getOptionalArgsParser(){
8484
addOptionalCommandLineArg(p, step_report_freq, "Set the steps between Chimbuko reporting IO step progress. Use 0 to deactivate this logging entirely (default 1)");
8585
addOptionalCommandLineArg(p, prov_record_startstep, "If != -1, the IO step on which to start recording provenance information for anomalies (for testing, default -1)");
8686
addOptionalCommandLineArg(p, prov_record_stopstep, "If != -1, the IO step on which to stop recording provenance information for anomalies (for testing, default -1)");
87+
addOptionalCommandLineArg(p, prov_min_anom_time, "Set the minimum exclusive runtime (in microseconds) for anomalies to recorded in the provenance output (default 0)");
8788
addOptionalCommandLineArg(p, analysis_step_freq, "Set the frequency in IO steps between analyzing the data. Data will be accumulated over intermediate steps. (default 1)");
8889
addOptionalCommandLineArg(p, read_ignored_corrid_funcs, "Set path to a file containing functions (one per line) for which the correlation ID counter should be ignored. If an empty string (default) no IDs will be ignored");
8990
addOptionalCommandLineArg(p, monitoring_watchlist_file, "Provide a filename containing the counter watchlist for the integration with the monitoring plugin. Empty string (default) uses the default subset. File format is JSON: \"[ [<COUNTER NAME>, <FIELD NAME>], ... ]\" where COUNTER NAME is the name of the counter in the input data stream and FIELD NAME the name of the counter in the provenance output.");

include/chimbuko/ad/ADAnomalyProvenance.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ namespace chimbuko{
4141
* @param step The io step
4242
* @param first_event_ts The timestamp of the first event in the io step
4343
* @param last_event_ts The timestamp of the last event in the io step
44+
*
45+
* Note the minimum anomaly time for recorded data does not apply to this call
4446
*/
4547
nlohmann::json getEventProvenance(const ExecData_t &call,
4648
const int step,
@@ -52,6 +54,13 @@ namespace chimbuko{
5254
*/
5355
void setWindowSize(const int sz){ m_window_size = sz; }
5456

57+
/**
58+
* @brief Set the minimum exclusive runtime (in microseconds) for recorded anomalies (default 0)
59+
*
60+
* Anomalies with exclusive runtime less than this will not have their data recorded
61+
*/
62+
void setMinimumAnomalyTime(const unsigned long to){ m_min_anom_time = to; }
63+
5564
/**
5665
* @brief If linked, performance information will be gathered
5766
*/
@@ -135,6 +144,7 @@ namespace chimbuko{
135144
int m_window_size; /**< The number of events either side of the anomaly to capture in the window*/
136145
ADMonitoring const *m_monitoring; /**< Node state information from TAU's monitoring plugin*/
137146
ParamInterface const *m_algo_params; /**< The algorithm parameters*/
147+
unsigned long m_min_anom_time; /**< Anomalies with exclusive runtime less than this will not have their data recorded*/
138148

139149
ADNormalEventProvenance m_normalevents; /**< Maintain information on a selection of normal events*/
140150
};

include/chimbuko/chimbuko.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ namespace chimbuko {
4848
#endif
4949
int prov_record_startstep; /**< If != -1, the IO step on which to start recording provenance information for anomalies */
5050
int prov_record_stopstep; /**< If != -1, the IO step on which to stop recording provenance information for anomalies */
51+
unsigned long prov_min_anom_time; /**< The minimum exclusive runtime (in microseconds) for anomalies recorded in the provenance output (default 0) */
5152

5253
unsigned int anom_win_size; /**< When anomaly data are recorded, a window of this size (in units of events) around the anomalous event are also recorded (used both for viz and provDB)*/
5354

src/ad/ADAnomalyProvenance.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
using namespace chimbuko;
77

8-
ADAnomalyProvenance::ADAnomalyProvenance(const ADEventIDmap &event_man): m_perf(nullptr), m_event_man(&event_man), m_monitoring(nullptr), m_metadata(nullptr), m_algo_params(nullptr), m_window_size(5){}
8+
ADAnomalyProvenance::ADAnomalyProvenance(const ADEventIDmap &event_man): m_perf(nullptr), m_event_man(&event_man), m_monitoring(nullptr), m_metadata(nullptr), m_algo_params(nullptr), m_window_size(5), m_min_anom_time(0){}
99

1010

1111
inline nlohmann::json getCallStackEntry(const ExecData_t &call){
@@ -225,23 +225,24 @@ void ADAnomalyProvenance::getProvenanceEntries(std::vector<nlohmann::json> &anom
225225

226226
//Gather provenance of anomalies and for each one try to obtain a normal execution
227227
timer.start();
228-
anom_event_entries.resize(anomalies.nEvents(Anomalies::EventType::Outlier));
229-
size_t i=0;
230228
std::unordered_set<unsigned long> normal_event_fids;
231229

232230
for(auto anom_it : anomalies.allEvents(Anomalies::EventType::Outlier)){
233231
timer2.start();
234-
anom_event_entries[i++] = getEventProvenance(*anom_it, step, first_event_ts, last_event_ts);
232+
if(anom_it->get_exclusive() < m_min_anom_time) continue; //skip executions with too short runtimes to avoid filling the database with irrelevant anomalies
233+
234+
anom_event_entries.push_back(getEventProvenance(*anom_it, step, first_event_ts, last_event_ts));
235235
if(m_perf) m_perf->add("ad_extract_send_prov_anom_data_generation_per_anom_ms", timer2.elapsed_ms());
236236

237237
//Get the associated normal event if one has not been recorded for this function on this step
238238
if(!normal_event_fids.count(anom_it->get_fid())){
239+
timer2.start();
239240
//if normal event not available put into the list of outstanding requests and it will be recorded next time a normal event for this function is obtained
240241
//if normal event is available, delete internal copy within m_normalevent_prov so the normal event isn't added more than once
241-
timer2.start();
242242
auto nev = m_normalevents.getNormalEvent(anom_it->get_pid(), anom_it->get_rid(), anom_it->get_tid(), anom_it->get_fid(), add_outstanding, do_delete);
243243
if(nev.second) normal_event_entries.push_back(std::move(nev.first));
244-
normal_event_fids.insert(anom_it->get_fid());
244+
245+
normal_event_fids.insert(anom_it->get_fid()); //make sure we don't record more than one normal event for this fid
245246
if(m_perf) m_perf->add("ad_extract_send_prov_normalevent_gather_per_anom_ms", timer2.elapsed_ms());
246247
}
247248

src/chimbuko.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ ChimbukoParams::ChimbukoParams(): rank(-1234), //not set!
3737
func_threshold_file(""),
3838
ignored_func_file(""),
3939
monitoring_watchlist_file(""),
40-
monitoring_counter_prefix("")
40+
monitoring_counter_prefix(""),
41+
prov_min_anom_time(0)
4142
{}
4243

4344
void ChimbukoParams::print() const{
@@ -316,6 +317,7 @@ void Chimbuko::init_provenance_gatherer(){
316317
m_anomaly_provenance->linkMonitoring(m_monitoring);
317318
m_anomaly_provenance->linkMetadata(m_metadata_parser);
318319
m_anomaly_provenance->setWindowSize(m_params.anom_win_size);
320+
m_anomaly_provenance->setMinimumAnomalyTime(m_params.prov_min_anom_time);
319321
m_ptr_registry.registerPointer(m_anomaly_provenance);
320322
}
321323

test/unit_tests/ad/ADAnomalyProvenance.cpp

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,3 +499,105 @@ TEST(TestADAnomalyProvenance, extractsHostname){
499499
nlohmann::json output = prov.getEventProvenance(exec1,8,800,1200);
500500
EXPECT_EQ(output["hostname"], "TheHost");
501501
}
502+
503+
504+
template<typename T>
505+
T & listEntry(std::list<T> &l, int i){
506+
auto it = std::next(l.begin(),i);
507+
return *it;
508+
}
509+
510+
TEST(TestADAnomalyProvenance, getProvenanceEntries){
511+
std::vector<ExecData_t> events = {
512+
createFuncExecData_t(1,2,3, 55, "theparent", 800, 200),
513+
createFuncExecData_t(1,2,3, 33, "thefunc", 900, 100),
514+
createFuncExecData_t(1,2,4, 11, "theotherparent", 1100, 200),
515+
createFuncExecData_t(1,2,4, 22, "theotherfunc", 1150, 50)
516+
};
517+
bindParentChild(events[0],events[1]);
518+
bindParentChild(events[2],events[3]);
519+
events[1].set_label(-1);
520+
events[3].set_label(-1);
521+
522+
ADEvent event_man;
523+
std::vector<CallListIterator_t> event_its;
524+
for(auto &e : events) event_its.push_back(event_man.addCall(e));
525+
526+
Anomalies anoms;
527+
anoms.insert(event_its[1],Anomalies::EventType::Outlier);
528+
anoms.insert(event_its[3],Anomalies::EventType::Outlier);
529+
530+
std::vector<nlohmann::json> anom_entries, normal_entries;
531+
{
532+
ADAnomalyProvenance prov(event_man);
533+
prov.getProvenanceEntries(anom_entries, normal_entries, anoms, 0, 800, 1200);
534+
}
535+
536+
ASSERT_EQ(anom_entries.size(),2);
537+
ASSERT_EQ(normal_entries.size(),0); //didn't put in normal events
538+
539+
std::string got, expect;
540+
541+
got = anom_entries[0]["event_id"]; expect = event_its[1]->get_id().toString();
542+
EXPECT_EQ(got,expect);
543+
got = anom_entries[1]["event_id"]; expect = event_its[3]->get_id().toString();
544+
EXPECT_EQ(got,expect);
545+
546+
//Repeat but add normal events for both
547+
events.push_back( createFuncExecData_t(1,2,3, 33, "thefunc", 700, 50) );
548+
events.push_back( createFuncExecData_t(1,2,4, 22, "theotherfunc", 400, 25) );
549+
event_its.push_back(event_man.addCall(events[4]));
550+
event_its.push_back(event_man.addCall(events[5]));
551+
anoms.insert(event_its[4],Anomalies::EventType::Normal);
552+
anoms.insert(event_its[5],Anomalies::EventType::Normal);
553+
554+
anom_entries.clear();
555+
normal_entries.clear();
556+
557+
{ //create prov anew to ensure normal event logic works as expected
558+
ADAnomalyProvenance prov(event_man);
559+
prov.getProvenanceEntries(anom_entries, normal_entries, anoms, 0, 800, 1200);
560+
}
561+
ASSERT_EQ(anom_entries.size(),2);
562+
ASSERT_EQ(normal_entries.size(),2);
563+
564+
got = normal_entries[0]["event_id"]; expect = event_its[4]->get_id().toString();
565+
EXPECT_EQ(got,expect);
566+
567+
got = normal_entries[1]["event_id"]; expect = event_its[5]->get_id().toString();
568+
EXPECT_EQ(got,expect);
569+
570+
//Test the minimum runtime
571+
anom_entries.clear();
572+
normal_entries.clear();
573+
574+
{ //create prov anew to ensure normal event logic works as expected
575+
ADAnomalyProvenance prov(event_man);
576+
prov.setMinimumAnomalyTime(60); //exclude second anomaly
577+
prov.getProvenanceEntries(anom_entries, normal_entries, anoms, 0, 800, 1200);
578+
}
579+
580+
ASSERT_EQ(anom_entries.size(),1);
581+
ASSERT_EQ(normal_entries.size(),1);
582+
583+
got = anom_entries[0]["event_id"]; expect = event_its[1]->get_id().toString();
584+
EXPECT_EQ(got,expect);
585+
got = normal_entries[0]["event_id"]; expect = event_its[4]->get_id().toString();
586+
EXPECT_EQ(got,expect);
587+
588+
589+
//Check that if we have multiple anomalies for the same function we only get one normal event
590+
events.push_back( createFuncExecData_t(1,2,3, 33, "thefunc", 700, 75) );
591+
event_its.push_back(event_man.addCall(events.back()));
592+
anoms.insert(event_its.back(),Anomalies::EventType::Outlier);
593+
594+
anom_entries.clear();
595+
normal_entries.clear();
596+
597+
{
598+
ADAnomalyProvenance prov(event_man);
599+
prov.getProvenanceEntries(anom_entries, normal_entries, anoms, 0, 800, 1200);
600+
}
601+
ASSERT_EQ(anom_entries.size(),3);
602+
ASSERT_EQ(normal_entries.size(),2);
603+
}

0 commit comments

Comments
 (0)