Skip to content

Commit aa7aa49

Browse files
author
Sandeep Mittal
committed
cleanup
1 parent 9026128 commit aa7aa49

3 files changed

Lines changed: 42 additions & 93 deletions

File tree

include/chimbuko/ad/ADOutlier.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -304,7 +304,7 @@ namespace chimbuko {
304304
/**
305305
* @brief Computes Empirical CDF of input vector of function runtimes
306306
*/
307-
auto empiricalCDF(const std::vector<double>& runtimes, const bool sorted=true);
307+
std::vector<double> empiricalCDF(const std::vector<double>& runtimes, const bool sorted=true);
308308

309309
private:
310310
double m_alpha; /**< Used to prevent log2 overflow */

src/ad/ADOutlier.cpp

Lines changed: 40 additions & 91 deletions
Original file line number | Diff line number | Diff line change
@@ -587,43 +587,52 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
587587
unsigned long n_outliers = 0;
588588

589589
//probability of runtime counts
590-
std::vector<double> prob_counts = std::vector<double>(param[func_id].counts().size(), 0.0);
590+
//std::vector<double> prob_counts = std::vector<double>(param[func_id].counts().size(), 0.0);
591591
double tot_runtimes = std::accumulate(param[func_id].counts().begin(), param[func_id].counts().end(), 0.0);
592592

593-
std::vector<double> recon_runtimes = std::vector<double>(tot_runtimes, 0.0);
593+
std::vector<double> recon_p_runtimes = std::vector<double>(tot_runtimes, 0.0);
594+
std::vector<double> recon_n_runtimes = std::vector<double>(tot_runtimes, 0.0);
594595
int recon_idx = 0;
595596
for(int i=0; i < param[func_id].counts().size(); i++){
596597
int count = param[func_id].counts().at(i);
597598
for(int j=0; j<count; j++){
598-
recon_runtimes.at(recon_idx++) = param[func_id].bin_edges.at(i);
599+
recon_p_runtimes.at(recon_idx++) = param[func_id].bin_edges().at(i);
600+
recon_n_runtimes.at(recon_idx++) = -1 * param[func_id].bin_edges().at(i);
599601
}
600602
}
601603

602-
auto func_ecdf = empiricalCDF(recon_runtimes, true);
604+
std::vector<double> func_p_ecdf = empiricalCDF(recon_p_runtimes, true);
605+
std::vector<double> func_n_ecdf = empiricalCDF(recon_n_runtimes, true);
603606

604-
for(int i=0; i < param[func_id].counts().size(); i++){
605-
int count = param[func_id].counts().at(i);
606-
double p = count / tot_runtimes;
607-
prob_counts.at(i) += p;
607+
std::vector<double> mean_pn_ecdf = std::vector<double>(func_p_ecdf.size(), 0.0);
608+
for(int i=0; i < mean_pn_ecdf.size(); i++){
609+
mean_pn_ecdf.at(i) = (func_p_ecdf.at(i) + func_n_ecdf.at(i)) / 2;
608610
}
609611

612+
613+
//for(int i=0; i < param[func_id].counts().size(); i++){
614+
// int count = param[func_id].counts().at(i);
615+
// double p = count / tot_runtimes;
616+
// prob_counts.at(i) += p;
617+
//}
618+
610619
//Create COPOD score vector
611620
std::vector<double> out_scores_i;
612621
double min_score = -1 * log2(0.0 + m_alpha);
613622
double max_score = -1 * log2(1.0 + m_alpha);
614623
verboseStream << "out_scores_i: " << std::endl;
615-
for(int i=0; i < prob_counts.size(); i++){
616-
double l = -1 * log2(prob_counts.at(i) + m_alpha);
624+
for(int i=0; i < mean_pn_ecdf.size(); i++){
625+
double l = -1 * log2(mean_pn_ecdf.at(i) + m_alpha);
617626
out_scores_i.push_back(l);
618-
verboseStream << "Count: " << param[func_id].counts().at(i) << ", Probability: " << prob_counts.at(i) << ", score: "<< l << std::endl;
619-
if(prob_counts.at(i) > 0) {
627+
//verboseStream << "Count: " << param[func_id].counts().at(i) << ", Probability: " << prob_counts.at(i) << ", score: "<< l << std::endl;
628+
//if(prob_counts.at(i) > 0) {
620629
if(l < min_score){
621630
min_score = l;
622631
}
623632
if(l > max_score){
624633
max_score = l;
625634
}
626-
}
635+
//}
627636
}
628637
verboseStream << std::endl;
629638
verboseStream << "out_score_i size: " << out_scores_i.size() << std::endl;
@@ -645,88 +654,22 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
645654
}
646655

647656
//Compute COPOD based score for each datapoint
648-
const double bin_width = param[func_id].bin_edges().at(1) - param[func_id].bin_edges().at(0);
649-
const int num_bins = param[func_id].counts().size();
650-
verboseStream << "Bin width: " << bin_width << std::endl;
657+
//const double bin_width = param[func_id].bin_edges().at(1) - param[func_id].bin_edges().at(0);
658+
//const int num_bins = param[func_id].counts().size();
659+
//verboseStream << "Bin width: " << bin_width << std::endl;
651660

652661
int top_out = 0;
662+
int running_idx = 0;
653663
for (auto itt : data) {
654664
if (itt->get_label() == 0) {
655665

656666
const double runtime_i = this->getStatisticValue(*itt); //runtimes.push_back(this->getStatisticValue(*itt));
657667
double ad_score;
658-
659-
const int bin_ind = ADOutlierCOPOD::np_digitize_get_bin_inds(runtime_i, param[func_id].bin_edges());
660-
verboseStream << "bin_ind: " << bin_ind << " for runtime_i: " << runtime_i << ", where bin_edges Size:" << param[func_id].bin_edges().size() << " & num_bins: "<< num_bins << std::endl;
661-
/**
662-
* If the sample does not belong to any bins
663-
* bin_ind == 0 (fall outside since it is too small)
664-
*/
665-
if( bin_ind == 0){
666-
const double first_bin_edge = param[func_id].bin_edges().at(0);
667-
const double dist = first_bin_edge - runtime_i;
668-
verboseStream << "First_bin_edge: " << first_bin_edge << std::endl;
669-
if( dist <= (bin_width * 0.05) ){
670-
verboseStream << runtime_i << " is on left of histogram but NOT outlier" << std::endl;
671-
if(param[func_id].counts().size() < 1) {return 0;}
672-
if(param[func_id].counts().at(0) == 0) { /**< Ignore zero counts */
673-
674-
ad_score = l_threshold - 1;
675-
verboseStream << "corrected ad_score: " << ad_score << std::endl;
676-
}
677-
else {
678-
ad_score = out_scores_i.at(0);
679-
verboseStream << "ad_score: " << ad_score << std::endl;
680-
}
681-
}
682-
else{
683-
verboseStream << runtime_i << " is on left of histogram and an outlier" << std::endl;
684-
ad_score = max_score;
685-
verboseStream << "ad_score(max_score): " << ad_score << std::endl;
686-
}
687-
688-
}
689-
/**
690-
* If the sample does not belong to any bins
691-
*/
692-
else if(bin_ind == num_bins + 1){
693-
const int last_idx = param[func_id].bin_edges().size() - 1;
694-
const double last_bin_edge = param[func_id].bin_edges().at(last_idx);
695-
const double dist = runtime_i - last_bin_edge;
696-
verboseStream << "last_indx: " << last_idx << ", last_bin_edge: " << last_bin_edge << std::endl;
697-
if (dist <= (bin_width * 0.05)) {
698-
if(param[func_id].counts().at(num_bins - 1) == 0) { //bin_ind) == 0) { /**< Ignore zero counts */
699-
700-
ad_score = l_threshold - 1;
701-
verboseStream << "corrected ad_score: " << ad_score << std::endl;
702-
}
703-
else {
704-
verboseStream << runtime_i << " is on right of histogram but NOT outlier" << std::endl;
705-
ad_score = out_scores_i.at(num_bins - 1);
706-
verboseStream << "ad_score: " << ad_score << ", num_bins: " << num_bins << ", out_scores_i size: " << out_scores_i.size() << std::endl;
707-
}
708-
}
709-
else{
710-
verboseStream << runtime_i << " is on right of histogram and an outlier" << std::endl;
711-
ad_score = max_score;
712-
verboseStream << "ad_score(max_score): " << ad_score << ", num_bins: " << num_bins << ", out_scores_i size: " << out_scores_i.size() << std::endl;
713-
}
714-
715-
}
716-
else {
717-
718-
if(param[func_id].counts().at(bin_ind) == 0) { /**< Ignore zero counts */
719-
720-
ad_score = l_threshold - 1;
721-
verboseStream << "corrected ad_score: " << ad_score << std::endl;
722-
}
723-
else {
724-
verboseStream << runtime_i << " maybe be an outlier" << std::endl;
725-
ad_score = out_scores_i.at( bin_ind - 1);
726-
verboseStream << "ad_score(else): " << ad_score << ", bin_ind: " << bin_ind << ", num_bins: " << num_bins << ", out_scores_i size: " << out_scores_i.size() << std::endl;
727-
}
728-
729-
}
668+
669+
if (mean_pn_ecdf.at(running_idx++) > 0)
670+
ad_score = l_threshold + 1;
671+
else
672+
ad_score = l_threshold - 1;
730673

731674
itt->set_outlier_score(ad_score);
732675
verboseStream << "ad_score: " << ad_score << ", l_threshold: " << l_threshold << std::endl;
@@ -740,7 +683,6 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
740683
n_outliers += 1;
741684

742685
}
743-
//}
744686
else {
745687
//Capture maximum of one normal execution per io step
746688
itt->set_label(1);
@@ -779,8 +721,15 @@ int ADOutlierCOPOD::np_digitize_get_bin_inds(const double& X, const std::vector<
779721
return ret_val;
780722
}
781723

782-
auto ADOutlierCOPOD::empiricalCDF(const std::vector<double>& runtimes, const bool sorted) {
724+
std::vector<double> ADOutlierCOPOD::empiricalCDF(const std::vector<double>& runtimes, const bool sorted) {
725+
726+
std::vector<double> tmp_runtimes = runtimes;
727+
auto ecdf = boost::math::empirical_cumulative_distribution_function(std::move(tmp_runtimes));
728+
std::vector<double> result_ecdf = std::vector<double>(runtimes.size(), 0.0);
729+
for(int i=0; i < runtimes.size(); i++) {
730+
result_ecdf.at(i) = ecdf(runtimes.at(i));
731+
}
783732

784-
return boost::math::empirical_cumulative_distribution_function(std::move(runtimes), sorted);
733+
return result_ecdf;
785734

786735
}

test/unit_tests/ad/Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/3rdparty @PS_FLAGS@
22
LDADD = $(top_builddir)/src/libchimbuko.la -lgtest -lstdc++fs
33

44
testdir = $(prefix)/test/unit_tests/ad
5-
test_PROGRAMS = HBOSOutlier HBOSOutlierADs HBOSOutlierDistributions HBOSOutlierTestBPFile SSTDOutlierTestBPFile ADEvent ADio ADParser ADOutlier ADNetClient ADLocalFuncStatistics ADMetadataParser ADCounter ADLocalCounterStatistics ADAnomalyProvenance ADglobalFunctionIndexMap ADNormalEventProvenance utils AnomalyData ADcombinedPSdata FuncAnomalyMetrics ADLocalAnomalyMetrics
5+
test_PROGRAMS = COPODOutlier COPODOutlierADs HBOSOutlier HBOSOutlierADs HBOSOutlierDistributions HBOSOutlierTestBPFile SSTDOutlierTestBPFile ADEvent ADio ADParser ADOutlier ADNetClient ADLocalFuncStatistics ADMetadataParser ADCounter ADLocalCounterStatistics ADAnomalyProvenance ADglobalFunctionIndexMap ADNormalEventProvenance utils AnomalyData ADcombinedPSdata FuncAnomalyMetrics ADLocalAnomalyMetrics
66

77
HBOSOutlier_SOURCES = HBOSOutlier.cpp ../unit_test_main_mpi.cpp
88
HBOSOutlier_LDADD = $(LDADD)

0 commit comments

Comments
 (0)