Skip to content

Commit d076400

Browse files
committed
Improved handling of events for which no global model information yet exists. The model can be empty if the current step is the first step on which a function has been seen, and/or if function data has not had time to pass through the parameter server. Previously such events were just labeled as normal, on which is reasonable in most cases. However this approach causes problems for GPU events, particularly for short-running programs, as the GPU event traces often appear as a big dump on a single step long after the function was executed, resulting in GPU events not being properly labeled. To combat this, events for which no model data is available will be left unlabeled (this is no longer cause for an error) and held until a future step where a model becomes available. Associated call-stack events are also maintained so as to ensure complete provenance output.
1 parent 96a23c5 commit d076400

7 files changed

Lines changed: 207 additions & 97 deletions

File tree

include/chimbuko/modules/performance_analysis/ad/ADEvent.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ namespace chimbuko {
248248
* throws a runtime error if the call is not present in the call-list
249249
*/
250250
CallListIterator_t getCallData(const eventID &event_id) const override;
251-
251+
252252
/**
253253
* @brief Get a pair of iterators marking the start and one-past-the-end of a window of size (up to) win_size events
254254
* on either size around the given event occurring on the same thread
@@ -329,6 +329,7 @@ namespace chimbuko {
329329
size_t n_kept_protected; /**< Number of calls maintained because they have been protected*/
330330
size_t n_kept_incomplete; /**< Number of calls maintained because they have not yet completed*/
331331
size_t n_kept_window; /**< Number of calls maintained because they may be needed for provenance window capture on next io step*/
332+
size_t n_kept_unlabeled; /**< Number of calls maintained because they have not yet been labeled*/
332333
};
333334

334335

@@ -414,7 +415,7 @@ namespace chimbuko {
414415
/**
415416
* @brief map of function index to an array of complete calls to this function during this IO step
416417
*
417-
* In practise this map is cleared every IO step by calls to trimCallList
418+
* In practise, labeled events are cleared every IO step by calls to purgeCallList
418419
*/
419420
ExecDataMap_t m_execDataMap;
420421

@@ -440,6 +441,12 @@ namespace chimbuko {
440441
*/
441442
std::unordered_set<std::string> m_ignoreCorrelationID;
442443

444+
445+
/**
446+
* @brief Set of events that have been stack locked because they weren't able to be labeled
447+
*/
448+
std::unordered_set<eventID> m_stackLockedUnlabeled;
449+
443450
/**
444451
* @brief verbose
445452
*

include/chimbuko/modules/performance_analysis/ad/ExecData.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,7 @@ namespace chimbuko {
602602
/**
603603
* @brief Determine whether the event can be deleted by the garbage collection at the end of the io step
604604
*/
605-
inline bool can_delete() const{ return m_references == 0;}
605+
inline bool can_delete() const{ return m_references == 0 && m_label != 0;}
606606

607607
/**
608608
* @brief Increment the external reference counter, preventing object deletion

src/core/ad/ADDataInterface.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@ void ADDataInterface::DataSetAnomalies::recordEvent(const Elem &event){
88
//Store a max of one normal event. Select that with the lower score (more normal)
99
if(m_normal_events.size() == 0) m_normal_events.push_back(event);
1010
else if(event.score < m_normal_events[0].score) m_normal_events[0] = event;
11-
}else{ //Outlier
11+
++m_labeled_events; //increment even if not recorded
12+
}else if(event.label == EventType::Outlier){ //Outlier
1213
m_anomalies.push_back(event);
14+
++m_labeled_events; //increment even if not recorded
1315
}
14-
++m_labeled_events; //increment even if not recorded
1516
}
1617

1718
size_t ADDataInterface::nEventsRecorded(EventType type) const{
@@ -29,9 +30,6 @@ size_t ADDataInterface::nEvents() const{
2930
}
3031

3132
void ADDataInterface::recordDataSetLabels(const std::vector<Elem> &data, size_t dset_index){
32-
//Check first for unassigned labels before we modify anything
33-
for(auto const &e : data) if(e.label == EventType::Unassigned){ fatal_error("Encountered unassigned label when recording; the AD algorithm should label all data points assigned!"); }
34-
3533
auto &danom = m_dset_anom[dset_index];
3634
for(auto const &e : data) danom.recordEvent(e);
3735
this->recordDataSetLabelsInternal(data,dset_index);

src/core/ad/ADOutlier.cpp

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -269,32 +269,22 @@ double ADOutlierSSTD::computeScore(double value, size_t model_idx, const SstdPar
269269
double std_dev = it->second.stddev();
270270
if(std_dev == 0.) std_dev = 1e-10; //distribution throws an error if std.dev = 0
271271

272-
//boost::math::normal_distribution<double> dist(mean, std_dev);
273-
//double cdf_val = boost::math::cdf(dist, runtime); // P( X <= x ) for random variable X
274-
//double score = std::min(cdf_val, 1-cdf_val); //two-tailed
275-
276-
//Using the CDF gives scores ~0 for basically any global outlier
277-
//Instead we will use the difference in runtime compared to the avg in units of the standard deviation
272+
//We use the difference in runtime compared to the avg in units of the standard deviation
278273
double score = fabs( value - mean ) / std_dev;
279274
return score;
280275
}
281276

282277

283278
void ADOutlierSSTD::labelData(std::vector<ADDataInterface::Elem> &data_vals, size_t dset_idx, size_t model_idx){
284-
285-
verboseStream << "Finding outliers in events for data set " << dset_idx << " of size " << data_vals.size() << std::endl;
286-
279+
verboseStream << "Finding outliers in events for data set " << dset_idx << " of size " << data_vals.size() << " and model idx " << model_idx << std::endl;
280+
287281
if(data_vals.size() == 0) return;
288282

289283
SstdParam& param = *(SstdParam*)m_param;
290284
auto & fparam = param[model_idx];
291285

292-
if (fparam.count() < 2){
293-
verboseStream << "Less than 2 events in stats associated with data set, stats not complete" << std::endl;
294-
for(auto &e : data_vals){ //still need to label all events
295-
e.label = ADDataInterface::EventType::Normal;
296-
e.score = 0;
297-
}
286+
if (fparam.count() < 2){ //TODO: This will probably cause problems if we only ever see <2 calls to the function!
287+
verboseStream << "Less than 2 events in stats associated with data set, stats not complete, delaying evaluation" << std::endl;
298288
return;
299289
}
300290

@@ -387,9 +377,8 @@ void ADOutlierHBOS::labelData(std::vector<ADDataInterface::Elem> &data_vals, siz
387377
//Check that the histogram contains bins
388378
if(nbin == 0){
389379
//As the pserver global model update is delayed, initially the clients may receive an empty model from the pserver for this data set
390-
//Given that the model at this stage is unreliable anyway, we simply skip the set and label the data as normal
380+
//To combat this, we hold off on labeling the data until we get a global model back
391381
verboseStream << "Global model is empty, skipping outlier evaluation for data set " << dset_idx << std::endl;
392-
for(auto &e : data_vals) e.label = ADDataInterface::EventType::Normal;
393382
return;
394383
}
395384

@@ -628,8 +617,7 @@ inline double copod_score(const double value_i, const Histogram &hist, const His
628617

629618

630619
void ADOutlierCOPOD::labelData(std::vector<ADDataInterface::Elem> &data_vals, size_t dset_idx, size_t model_idx){
631-
verboseStream << "Finding outliers in events for data set " << dset_idx << std::endl;
632-
verboseStream << "data Size: " << data_vals.size() << std::endl;
620+
verboseStream << "Finding outliers in events for data set " << dset_idx << " of size " << data_vals.size() << " and model idx " << model_idx << std::endl;
633621

634622
if(data_vals.size() == 0) return;
635623

@@ -648,9 +636,8 @@ void ADOutlierCOPOD::labelData(std::vector<ADDataInterface::Elem> &data_vals, si
648636
//Check that the histogram contains bins
649637
if(nbin == 0){
650638
//As the pserver global model update is delayed, initially the clients may receive an empty model from the pserver for this function
651-
//Given that the model at this stage is unreliable anyway, we simply skip the function and label the data as normal
639+
//To combat this we delay evaluation of the score until we get some data
652640
verboseStream << "Global model is empty, skipping outlier evaluation for data set " << dset_idx << std::endl;
653-
for(auto &e : data_vals) e.label = ADDataInterface::EventType::Normal;
654641
return;
655642
}
656643

src/modules/performance_analysis/ad/ADEvent.cpp

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -46,37 +46,51 @@ void ADEvent::stackProtectGC(CallListIterator_t it){
4646
verboseStream << "ADEvent::stackProtectGC incrementing register count of " << it->get_id().toString() << " to " << it->reference_count()+1 << std::endl;
4747
it->register_reference();
4848

49-
eventID parent = it->get_parent();
50-
while(parent != eventID::root()){
51-
CallListIterator_t pit;
52-
try{
53-
pit = getCallData(parent);
54-
}catch(const std::exception &e){
55-
recoverable_error("Could not find parent " + parent.toString() + " in call list due to : " + e.what());
56-
break;
49+
std::vector<eventID> parents(1, it->get_parent());
50+
//Also lock GPU event parents
51+
if(m_gpu_thread_Map != nullptr && m_gpu_thread_Map->count(it->get_tid()) && it->n_GPU_correlationID_partner() == 1){
52+
parents.push_back(it->get_GPU_correlationID_partner(0));
53+
}
54+
55+
for(auto parent : parents){
56+
while(parent != eventID::root()){
57+
CallListIterator_t pit;
58+
try{
59+
pit = getCallData(parent);
60+
}catch(const std::exception &e){
61+
recoverable_error("Could not find parent " + parent.toString() + " in call list due to : " + e.what());
62+
break;
63+
}
64+
verboseStream << "ADEvent::stackProtectGC incrementing register count of " << pit->get_id().toString() << " to " << pit->reference_count()+1 << std::endl;
65+
pit->register_reference();
66+
parent = pit->get_parent();
5767
}
58-
verboseStream << "ADEvent::stackProtectGC incrementing register count of " << pit->get_id().toString() << " to " << pit->reference_count()+1 << std::endl;
59-
pit->register_reference();
60-
parent = pit->get_parent();
6168
}
6269
}
6370

6471
void ADEvent::stackUnProtectGC(CallListIterator_t it){
6572
verboseStream << "ADEvent::stackUnProtectGC decrementing register count of " << it->get_id().toString() << " to " << it->reference_count()-1 << std::endl;
6673
it->deregister_reference();
6774

68-
eventID parent = it->get_parent();
69-
while(parent != eventID::root()){
70-
CallListIterator_t pit;
71-
try{
72-
pit = getCallData(parent);
73-
}catch(const std::exception &e){
74-
recoverable_error("Could not find parent " + parent.toString() + " in call list due to : " + e.what());
75-
break;
75+
std::vector<eventID> parents(1, it->get_parent());
76+
//Also unlock GPU event parents
77+
if(m_gpu_thread_Map != nullptr && m_gpu_thread_Map->count(it->get_tid()) && it->n_GPU_correlationID_partner() == 1){
78+
parents.push_back(it->get_GPU_correlationID_partner(0));
79+
}
80+
81+
for(auto parent : parents){
82+
while(parent != eventID::root()){
83+
CallListIterator_t pit;
84+
try{
85+
pit = getCallData(parent);
86+
}catch(const std::exception &e){
87+
recoverable_error("Could not find parent " + parent.toString() + " in call list due to : " + e.what());
88+
break;
89+
}
90+
verboseStream << "ADEvent::stackUnProtectGC decrementing register count of " << pit->get_id().toString() << " to " << pit->reference_count()-1 << std::endl;
91+
pit->deregister_reference();
92+
parent = pit->get_parent();
7693
}
77-
verboseStream << "ADEvent::stackUnProtectGC decrementing register count of " << pit->get_id().toString() << " to " << pit->reference_count()-1 << std::endl;
78-
pit->deregister_reference();
79-
parent = pit->get_parent();
8094
}
8195
}
8296

@@ -366,6 +380,17 @@ static unsigned long nested_map_size(const T& m) {
366380
return n_elements;
367381
}
368382

383+
static void clearUnlabeled(ExecDataMap_t &execDataMap){
384+
for(auto &fid_p : execDataMap){
385+
std::vector<CallListIterator_t> keep;
386+
for(const CallListIterator_t &cit : fid_p.second)
387+
if(cit->get_label() == 0) keep.push_back(cit);
388+
fid_p.second = std::move(keep);
389+
verboseStream << "clearUnlabeled kept " << fid_p.second.size() << " unlabeled points for fid " << fid_p.first << std::endl;
390+
}
391+
}
392+
393+
369394
CallListMap_p_t* ADEvent::trimCallList(int n_keep_thread) {
370395
//Remove completed entries from the call list
371396
CallListMap_p_t* cpListMap = new CallListMap_p_t;
@@ -400,13 +425,13 @@ CallListMap_p_t* ADEvent::trimCallList(int n_keep_thread) {
400425
}
401426
}
402427
}
403-
m_execDataMap.clear();
428+
clearUnlabeled(m_execDataMap);
404429
return cpListMap;
405430
}
406431

407432

408433
void ADEvent::purgeCallList(int n_keep_thread, purgeReport* report) {
409-
size_t n_purged=0, n_kept_protected=0, n_kept_incomplete=0, n_kept_window=0;
434+
size_t n_purged=0, n_kept_protected=0, n_kept_incomplete=0, n_kept_window=0, n_kept_unlabeled=0;
410435

411436
//Remove completed entries from the call list
412437
for (auto& it_p : m_callList) {
@@ -425,6 +450,14 @@ void ADEvent::purgeCallList(int n_keep_thread, purgeReport* report) {
425450
auto one_past_last = std::prev(cl.end(),n_keep_thread);
426451

427452
while (it != one_past_last) {
453+
//Check if entry and stack was locked due to being unlabeled, and if now labeled, unlock
454+
std::unordered_set<eventID>::iterator sl_it;
455+
if(it->get_label() != 0 && (sl_it = m_stackLockedUnlabeled.find(it->get_id()) ) != m_stackLockedUnlabeled.end()){
456+
verboseStream << "Previously unlabeled event " << it->get_id().toString() << " has now been labeled, unlocking stack" << std::endl;
457+
stackUnProtectGC(it);
458+
m_stackLockedUnlabeled.erase(sl_it);
459+
}
460+
428461
if (it->can_delete() && it->get_exit() != 0) {
429462
//Remove completed event from map of event index string to call list
430463
m_callIDMap.erase(it->get_id());
@@ -434,7 +467,16 @@ void ADEvent::purgeCallList(int n_keep_thread, purgeReport* report) {
434467
}
435468
else {
436469
if(it->get_exit() == 0) ++n_kept_incomplete;
437-
else if(!it->can_delete()) ++n_kept_protected;
470+
else if(!it->can_delete()){
471+
if(it->get_label() == 0 && m_stackLockedUnlabeled.count(it->get_id()) == 0){
472+
//Lock the stack to ensure that events referred to don't get erased
473+
verboseStream << "Event " << it->get_id().toString() << " is not labeled, locking stack" << std::endl;
474+
stackProtectGC(it);
475+
m_stackLockedUnlabeled.insert(it->get_id());
476+
++n_kept_unlabeled;
477+
}
478+
else ++n_kept_protected;
479+
}
438480
it++;
439481
}
440482
}
@@ -446,9 +488,9 @@ void ADEvent::purgeCallList(int n_keep_thread, purgeReport* report) {
446488
report->n_kept_protected = n_kept_protected;
447489
report->n_kept_incomplete = n_kept_incomplete;
448490
report->n_kept_window = n_kept_window;
491+
report->n_kept_unlabeled = n_kept_unlabeled;
449492
}
450-
451-
m_execDataMap.clear();
493+
clearUnlabeled(m_execDataMap);
452494
}
453495

454496
size_t ADEvent::getCallListSize() const{

src/modules/performance_analysis/ad/ADExecDataInterface.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ void ADExecDataInterface::recordDataSetLabelsInternal(const std::vector<Elem> &d
8080
for(auto const &e: data){
8181
CallListIterator_t eint = it->second[e.index];
8282
eint->set_outlier_score(e.score);
83-
if(e.label == EventType::Unassigned){ fatal_error("Encountered an unassigned label when recording"); }
84-
eint->set_label( e.label == EventType::Outlier ? -1 : 1 );
83+
if(e.label != EventType::Unassigned){
84+
eint->set_label( e.label == EventType::Outlier ? -1 : 1 );
85+
}
8586
}
8687
}

0 commit comments

Comments
 (0)