Skip to content

Commit 94ec04f

Browse files
author
sandeepmittal
committed
Fix histogram merging when bin_edges is empty
1 parent 91eb0de commit 94ec04f

2 files changed

Lines changed: 59 additions & 53 deletions

File tree

src/ad/ADOutlier.cpp

Lines changed: 45 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ Anomalies ADOutlierHBOS::run(int step) {
247247
HbosParam& g = *(HbosParam*)m_param;
248248
for (auto it : *m_execDataMap) { //loop over functions (key is function index)
249249
unsigned long func_id = it.first;
250+
Histogram &hist = param[func_id];
250251
std::vector<double> runtimes;
251252
for (auto itt : it.second) { //loop over events for that function
252253
if (itt->get_label() == 0) {
@@ -267,17 +268,17 @@ Anomalies ADOutlierHBOS::run(int step) {
267268
if (runtimes.size() > 0) {
268269
if (!g.find(func_id)) { // If func_id does not exist
269270

270-
const int r = param[func_id].create_histogram(runtimes);
271+
const int r = hist.create_histogram(runtimes);
271272
if (r < 0) {
272273
recoverable_error(std::string("AD: Func_ID does not exist"));
273274
continue;
274275
}
275276
}
276277
else { //merge with existing func_id, do not overwrite
277278

278-
const int r = param[func_id].merge_histograms(g[func_id], runtimes);
279+
const int r = hist.merge_histograms(g[func_id], runtimes);
279280
if (r < 0) {
280-
recoverable_error(std::string("AD: Merging error received "));
281+
verboseStream << "AD: Merging reset " << std::endl;
281282
continue;
282283
}
283284
}
@@ -315,16 +316,16 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
315316
verboseStream << "Finding outliers in events for func " << func_id << std::endl;
316317

317318
HbosParam& param = *(HbosParam*)m_param;
318-
319+
Histogram &hist = param[func_id];
319320

320321
unsigned long n_outliers = 0;
321322

322323
//probability of runtime counts
323-
std::vector<double> prob_counts = std::vector<double>(param[func_id].counts().size(), 0.0);
324-
double tot_runtimes = std::accumulate(param[func_id].counts().begin(), param[func_id].counts().end(), 0.0);
324+
std::vector<double> prob_counts = std::vector<double>(hist.counts().size(), 0.0);
325+
double tot_runtimes = std::accumulate(hist.counts().begin(), hist.counts().end(), 0.0);
325326

326-
for(int i=0; i < param[func_id].counts().size(); i++){
327-
int count = param[func_id].counts().at(i);
327+
for(int i=0; i < hist.counts().size(); i++){
328+
int count = hist.counts().at(i);
328329
double p = count / tot_runtimes;
329330
prob_counts.at(i) += p;
330331

@@ -338,7 +339,7 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
338339
for(int i=0; i < prob_counts.size(); i++){
339340
double l = -1 * log2(prob_counts.at(i) + m_alpha);
340341
out_scores_i.push_back(l);
341-
verboseStream << "Count: " << param[func_id].counts().at(i) << ", Probability: " << prob_counts.at(i) << ", score: "<< l << std::endl;
342+
verboseStream << "Count: " << hist.counts().at(i) << ", Probability: " << prob_counts.at(i) << ", score: "<< l << std::endl;
342343
if(prob_counts.at(i) > 0) {
343344
if(l < min_score){
344345
min_score = l;
@@ -356,20 +357,19 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
356357
if (out_scores_i.size() <= 0) {return 0;}
357358

358359
//compute threshold
359-
verboseStream << "Global threshold before comparison with local threshold = " << param[func_id].get_threshold() << std::endl;
360+
verboseStream << "Global threshold before comparison with local threshold = " << hist.get_threshold() << std::endl;
360361
double l_threshold = min_score + (m_threshold * (max_score - min_score));
361362
if(m_use_global_threshold) {
362-
if(l_threshold < param[func_id].get_threshold()) {
363-
l_threshold = param[func_id].get_threshold();
363+
if(l_threshold < hist.get_threshold()) {
364+
l_threshold = hist.get_threshold();
364365
} else {
365-
param[func_id].set_glob_threshold(l_threshold); //.get_histogram().glob_threshold = l_threshold;
366-
//std::pair<size_t, size_t> msgsz_thres_update = sync_param(&param);
366+
hist.set_glob_threshold(l_threshold);
367367
}
368368
}
369369

370370
//Compute HBOS based score for each datapoint
371-
const double bin_width = param[func_id].bin_edges().at(1) - param[func_id].bin_edges().at(0);
372-
const int num_bins = param[func_id].counts().size();
371+
const double bin_width = hist.bin_edges().at(1) - hist.bin_edges().at(0);
372+
const int num_bins = hist.counts().size();
373373
verboseStream << "Bin width: " << bin_width << std::endl;
374374

375375
int top_out = 0;
@@ -379,20 +379,20 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
379379
const double runtime_i = this->getStatisticValue(*itt); //runtimes.push_back(this->getStatisticValue(*itt));
380380
double ad_score;
381381

382-
const int bin_ind = ADOutlierHBOS::np_digitize_get_bin_inds(runtime_i, param[func_id].bin_edges());
383-
verboseStream << "bin_ind: " << bin_ind << " for runtime_i: " << runtime_i << ", where bin_edges Size:" << param[func_id].bin_edges().size() << " & num_bins: "<< num_bins << std::endl;
382+
const int bin_ind = ADOutlierHBOS::np_digitize_get_bin_inds(runtime_i, hist.bin_edges());
383+
verboseStream << "bin_ind: " << bin_ind << " for runtime_i: " << runtime_i << ", where bin_edges Size:" << hist.bin_edges().size() << " & num_bins: "<< num_bins << std::endl;
384384
/**
385-
* If the sample does not belong to any bins
385+
* The sample (datapoint) is either in the first bin or does not belong to any bin
386386
* bin_ind == 0 (fall outside since it is too small)
387387
*/
388388
if( bin_ind == 0){
389-
const double first_bin_edge = param[func_id].bin_edges().at(0);
389+
const double first_bin_edge = hist.bin_edges().at(0);
390390
const double dist = first_bin_edge - runtime_i;
391391
verboseStream << "First_bin_edge: " << first_bin_edge << std::endl;
392392
if( dist <= (bin_width * 0.05) ){
393-
verboseStream << runtime_i << " is on left of histogram but NOT outlier" << std::endl;
394-
if(param[func_id].counts().size() < 1) {return 0;}
395-
if(param[func_id].counts().at(0) == 0) { /**< Ignore zero counts */
393+
verboseStream << runtime_i << " is in first bin of Histogram but NOT outlier" << std::endl;
394+
if(hist.counts().size() < 1) {return 0;}
395+
if(hist.counts().at(0) == 0) { /**< Ignore zero counts */
396396

397397
ad_score = l_threshold - 1;
398398
verboseStream << "corrected ad_score: " << ad_score << std::endl;
@@ -403,7 +403,7 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
403403
}
404404
}
405405
else{
406-
verboseStream << runtime_i << " is on left of histogram and an outlier" << std::endl;
406+
verboseStream << runtime_i << " is NOT in first bin of Histogram and it IS an outlier" << std::endl;
407407
ad_score = max_score;
408408
verboseStream << "ad_score(max_score): " << ad_score << std::endl;
409409
}
@@ -413,12 +413,12 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
413413
* If the sample does not belong to any bins
414414
*/
415415
else if(bin_ind == num_bins + 1){
416-
const int last_idx = param[func_id].bin_edges().size() - 1;
417-
const double last_bin_edge = param[func_id].bin_edges().at(last_idx);
416+
const int last_idx = hist.bin_edges().size() - 1;
417+
const double last_bin_edge = hist.bin_edges().at(last_idx);
418418
const double dist = runtime_i - last_bin_edge;
419419
verboseStream << "last_indx: " << last_idx << ", last_bin_edge: " << last_bin_edge << std::endl;
420420
if (dist <= (bin_width * 0.05)) {
421-
if(param[func_id].counts().at(num_bins - 1) == 0) { //bin_ind) == 0) { /**< Ignore zero counts */
421+
if(hist.counts().at(num_bins - 1) == 0) { /**< Ignore zero counts */
422422

423423
ad_score = l_threshold - 1;
424424
verboseStream << "corrected ad_score: " << ad_score << std::endl;
@@ -438,7 +438,7 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
438438
}
439439
else {
440440

441-
if(param[func_id].counts().at(bin_ind) == 0) { /**< Ignore zero counts */
441+
if(hist.counts().at(bin_ind) == 0) { /**< Ignore zero counts */
442442

443443
ad_score = l_threshold - 1;
444444
verboseStream << "corrected ad_score: " << ad_score << std::endl;
@@ -533,6 +533,7 @@ Anomalies ADOutlierCOPOD::run(int step) {
533533
CopodParam& g = *(CopodParam*)m_param;
534534
for (auto it : *m_execDataMap) { //loop over functions (key is function index)
535535
unsigned long func_id = it.first;
536+
Histogram &hist = param[func_id];
536537
std::vector<double> runtimes;
537538
for (auto itt : it.second) { //loop over events for that function
538539
if (itt->get_label() == 0) {
@@ -552,17 +553,17 @@ Anomalies ADOutlierCOPOD::run(int step) {
552553
}
553554
if (runtimes.size() > 0) {
554555
if (!g.find(func_id)) { // If func_id does not exist
555-
const int r = param[func_id].create_histogram(runtimes);
556+
const int r = hist.create_histogram(runtimes);
556557
if (r < 0) {
557558
recoverable_error(std::string("AD: Func_ID does not exist "));
558559
continue;
559560
}
560561
}
561562
else { //merge with existing func_id, do not overwrite
562563

563-
const int r = param[func_id].merge_histograms(g[func_id], runtimes);
564+
const int r = hist.merge_histograms(g[func_id], runtimes);
564565
if (r < 0) {
565-
recoverable_error(std::string("AD: Merging error received "));
566+
verboseStream << "AD: Merging reset " << std::endl;
566567
continue;
567568
}
568569
}
@@ -619,28 +620,28 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
619620
verboseStream << "data Size: " << data.size() << std::endl;
620621

621622
CopodParam& param = *(CopodParam*)m_param;
622-
623+
Histogram &hist = param[func_id];
623624

624625
unsigned long n_outliers = 0;
625626

626627
//probability of runtime counts
627628
//std::vector<double> prob_counts = std::vector<double>(param[func_id].counts().size(), 0.0);
628-
double tot_runtimes = std::accumulate(param[func_id].counts().begin(), param[func_id].counts().end(), 0.0);
629+
double tot_runtimes = std::accumulate(hist.counts().begin(), hist.counts().end(), 0.0);
629630

630631
if (tot_runtimes <= 0 ) {
631632
return n_outliers;
632633
}
633634
std::vector<double> recon_p_runtimes = std::vector<double>(tot_runtimes, 0.0);
634635
std::vector<double> recon_n_runtimes = std::vector<double>(tot_runtimes, 0.0);
635636
int recon_idx = 0;
636-
verboseStream << "Unwrapping Merged Histogram. Size: " << param[func_id].counts().size() << std::endl;
637-
for(int i=0; i < param[func_id].counts().size(); i++){
638-
int count = param[func_id].counts().at(i);
639-
verboseStream << "Count: " << count << ", Value: " << param[func_id].bin_edges().at(i) << std::endl;
637+
verboseStream << "Unwrapping Merged Histogram. Size: " << hist.counts().size() << std::endl;
638+
for(int i=0; i < hist.counts().size(); i++){
639+
int count = hist.counts().at(i);
640+
verboseStream << "Count: " << count << ", Value: " << hist.bin_edges().at(i) << std::endl;
640641
for(int j=0; j<count; j++){
641642

642-
recon_p_runtimes.at(recon_idx) = param[func_id].bin_edges().at(i);
643-
recon_n_runtimes.at(recon_idx) = -1 * param[func_id].bin_edges().at(i);
643+
recon_p_runtimes.at(recon_idx) = hist.bin_edges().at(i);
644+
recon_n_runtimes.at(recon_idx) = -1 * hist.bin_edges().at(i);
644645
verboseStream << "recon_idx: " << recon_idx << std::endl;
645646
verboseStream << "recon_p_runtimes.at(recon_idx): " << recon_p_runtimes.at(recon_idx) << ", recon_n_runtimes.at(recon_idx): " << recon_n_runtimes.at(recon_idx) << std::endl;
646647
recon_idx++;
@@ -678,9 +679,6 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
678679
}
679680

680681

681-
//for(int i=0; i<mean_pn_ecdf.size(); i++)
682-
// verboseStream << "mean_pn_ecdf at " << i << ": " << mean_pn_ecdf.at(i) << std::endl;
683-
684682
//Create COPOD score vector
685683
std::vector<double> out_scores_i = std::vector<double>(final_comp.size(), 0.0);
686684
verboseStream << "m_alpha: " << m_alpha << std::endl;
@@ -710,14 +708,14 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
710708
if (out_scores_i.size() <= 0) {return 0;}
711709

712710
//compute threshold
713-
verboseStream << "Global threshold before comparison with local threshold = " << param[func_id].get_threshold() << std::endl;
711+
verboseStream << "Global threshold before comparison with local threshold = " << hist.get_threshold() << std::endl;
714712
double l_threshold = (max_score < 0) ? (-1 * m_threshold * (max_score - min_score)) : min_score + (m_threshold * (max_score - min_score));
715713
verboseStream << "l_threshold computed: " << l_threshold << std::endl;
716714
if(m_use_global_threshold) {
717-
if(l_threshold < param[func_id].get_threshold() && param[func_id].get_threshold() > (-1 * log2(1.00001))) {
718-
l_threshold = param[func_id].get_threshold();
715+
if(l_threshold < hist.get_threshold() && hist.get_threshold() > (-1 * log2(1.00001))) {
716+
l_threshold = hist.get_threshold();
719717
} else {
720-
param[func_id].set_glob_threshold(l_threshold); //.get_histogram().glob_threshold = l_threshold;
718+
hist.set_glob_threshold(l_threshold); //.get_histogram().glob_threshold = l_threshold;
721719
//std::pair<size_t, size_t> msgsz_thres_update = sync_param(&param);
722720
}
723721
}
@@ -743,7 +741,7 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
743741
running_idx++;
744742
}
745743
else {
746-
recoverable_error("AD: COPOD: runtime Index");
744+
verboseStream << "AD: COPOD: runtime Index" << std::endl;
747745
continue;
748746
}
749747

src/param/hbos_param.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ using namespace chimbuko;
344344
var = var / size;
345345
verboseStream << "Final Variance in _scott_binWidth: " << var << std::endl;
346346
std = sqrt(var);
347-
verboseStream << "STD in merging _scott_binWidth: " << std << std::endl;
347+
verboseStream << "stddev in merging _scott_binWidth: " << std << std::endl;
348348
if (std <= 100.0) {return 0;}
349349

350350
return ((3.5 * std ) / pow(size, 1/3));
@@ -363,7 +363,7 @@ using namespace chimbuko;
363363
}
364364
var = var / vals.size();
365365
std = sqrt(var);
366-
verboseStream << "STD in _scott_binWidth: " << std << std::endl;
366+
verboseStream << "stddev in _scott_binWidth: " << std << std::endl;
367367

368368
return ((3.5 * std ) / pow(vals.size(), 1/3));
369369
}
@@ -397,7 +397,7 @@ using namespace chimbuko;
397397
m_histogram.bin_edges.push_back(prev + bin_width);
398398
prev += bin_width;
399399
}
400-
//std::cout << "Number of bins: " << m_histogram.bin_edges.size()-1 << std::endl;
400+
verboseStream << "Number of bins: " << m_histogram.bin_edges.size()-1 << std::endl;
401401

402402
if (m_histogram.counts.size() > 0) m_histogram.counts.clear();
403403
m_histogram.counts = std::vector<int>(m_histogram.bin_edges.size()-1, 0);
@@ -409,7 +409,7 @@ using namespace chimbuko;
409409
}
410410
}
411411
}
412-
//std::cout << "Size of counts: " << m_histogram.counts.size() << std::endl;
412+
verboseStream << "Size of counts: " << m_histogram.counts.size() << std::endl;
413413

414414
//m_histogram.runtimes.clear();
415415
const double min_threshold = -1 * log2(1.00001);
@@ -423,10 +423,18 @@ using namespace chimbuko;
423423
int Histogram::merge_histograms(const Histogram& g, const std::vector<double>& runtimes)
424424
{
425425

426-
std::vector<double> r_times = runtimes;
426+
std::vector<double> r_times(runtimes.size()); // = runtimes;
427+
verboseStream << "Number of runtime events during mergin: " << runtimes.size() << std::endl;
428+
verboseStream << "total number of 'g' bin_edges: " << g.bin_edges().size() << std::endl;
427429

430+
//Fix for XGC runs where an unlabelled func_id is retained, resulting in zero bin_edges
431+
if (g.bin_edges().size() <= 0)
432+
return this->create_histogram(runtimes);
433+
434+
//Unwrapping the histogram
428435
for (int i = 0; i < g.bin_edges().size() - 1; i++) {
429-
for(int j = 0; j < g.counts().at(i); j++){
436+
verboseStream << " Bin counts in " << i << ": " << g.counts()[i] << std::endl;
437+
for(int j = 0; j < g.counts().at(i); j++){
430438
r_times.push_back(g.bin_edges().at(i));
431439
}
432440
}

0 commit comments

Comments
 (0)