@@ -587,43 +587,52 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
587587 unsigned long n_outliers = 0 ;
588588
589589 // probability of runtime counts
590- std::vector<double > prob_counts = std::vector<double >(param[func_id].counts ().size (), 0.0 );
590+ // std::vector<double> prob_counts = std::vector<double>(param[func_id].counts().size(), 0.0);
591591 double tot_runtimes = std::accumulate (param[func_id].counts ().begin (), param[func_id].counts ().end (), 0.0 );
592592
593- std::vector<double > recon_runtimes = std::vector<double >(tot_runtimes, 0.0 );
593+ std::vector<double > recon_p_runtimes = std::vector<double >(tot_runtimes, 0.0 );
594+ std::vector<double > recon_n_runtimes = std::vector<double >(tot_runtimes, 0.0 );
594595 int recon_idx = 0 ;
595596 for (int i=0 ; i < param[func_id].counts ().size (); i++){
596597 int count = param[func_id].counts ().at (i);
597598 for (int j=0 ; j<count; j++){
598- recon_runtimes.at (recon_idx++) = param[func_id].bin_edges .at (i);
599+ recon_p_runtimes.at (recon_idx++) = param[func_id].bin_edges ().at (i);
600+ recon_n_runtimes.at (recon_idx++) = -1 * param[func_id].bin_edges ().at (i);
599601 }
600602 }
601603
602- auto func_ecdf = empiricalCDF (recon_runtimes, true );
604+ std::vector<double > func_p_ecdf = empiricalCDF (recon_p_runtimes, true );
605+ std::vector<double > func_n_ecdf = empiricalCDF (recon_n_runtimes, true );
603606
604- for (int i=0 ; i < param[func_id].counts ().size (); i++){
605- int count = param[func_id].counts ().at (i);
606- double p = count / tot_runtimes;
607- prob_counts.at (i) += p;
607+ std::vector<double > mean_pn_ecdf = std::vector<double >(func_p_ecdf.size (), 0.0 );
608+ for (int i=0 ; i < mean_pn_ecdf.size (); i++){
609+ mean_pn_ecdf.at (i) = (func_p_ecdf.at (i) + func_n_ecdf.at (i)) / 2 ;
608610 }
609611
612+
613+ // for(int i=0; i < param[func_id].counts().size(); i++){
614+ // int count = param[func_id].counts().at(i);
615+ // double p = count / tot_runtimes;
616+ // prob_counts.at(i) += p;
617+ // }
618+
610619 // Create COPOD score vector
611620 std::vector<double > out_scores_i;
612621 double min_score = -1 * log2 (0.0 + m_alpha);
613622 double max_score = -1 * log2 (1.0 + m_alpha);
614623 verboseStream << " out_scores_i: " << std::endl;
615- for (int i=0 ; i < prob_counts .size (); i++){
616- double l = -1 * log2 (prob_counts .at (i) + m_alpha);
624+ for (int i=0 ; i < mean_pn_ecdf .size (); i++){
625+ double l = -1 * log2 (mean_pn_ecdf .at (i) + m_alpha);
617626 out_scores_i.push_back (l);
618- verboseStream << " Count: " << param[func_id].counts ().at (i) << " , Probability: " << prob_counts.at (i) << " , score: " << l << std::endl;
619- if (prob_counts.at (i) > 0 ) {
627+ // verboseStream << "Count: " << param[func_id].counts().at(i) << ", Probability: " << prob_counts.at(i) << ", score: "<< l << std::endl;
628+ // if(prob_counts.at(i) > 0) {
620629 if (l < min_score){
621630 min_score = l;
622631 }
623632 if (l > max_score){
624633 max_score = l;
625634 }
626- }
635+ // }
627636 }
628637 verboseStream << std::endl;
629638 verboseStream << " out_score_i size: " << out_scores_i.size () << std::endl;
@@ -645,88 +654,22 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
645654 }
646655
647656 // Compute COPOD based score for each datapoint
648- const double bin_width = param[func_id].bin_edges ().at (1 ) - param[func_id].bin_edges ().at (0 );
649- const int num_bins = param[func_id].counts ().size ();
650- verboseStream << " Bin width: " << bin_width << std::endl;
657+ // const double bin_width = param[func_id].bin_edges().at(1) - param[func_id].bin_edges().at(0);
658+ // const int num_bins = param[func_id].counts().size();
659+ // verboseStream << "Bin width: " << bin_width << std::endl;
651660
652661 int top_out = 0 ;
662+ int running_idx = 0 ;
653663 for (auto itt : data) {
654664 if (itt->get_label () == 0 ) {
655665
656666 const double runtime_i = this ->getStatisticValue (*itt); // runtimes.push_back(this->getStatisticValue(*itt));
657667 double ad_score;
658-
659- const int bin_ind = ADOutlierCOPOD::np_digitize_get_bin_inds (runtime_i, param[func_id].bin_edges ());
660- verboseStream << " bin_ind: " << bin_ind << " for runtime_i: " << runtime_i << " , where bin_edges Size:" << param[func_id].bin_edges ().size () << " & num_bins: " << num_bins << std::endl;
661- /* *
662- * If the sample does not belong to any bins
663- * bin_ind == 0 (fall outside since it is too small)
664- */
665- if ( bin_ind == 0 ){
666- const double first_bin_edge = param[func_id].bin_edges ().at (0 );
667- const double dist = first_bin_edge - runtime_i;
668- verboseStream << " First_bin_edge: " << first_bin_edge << std::endl;
669- if ( dist <= (bin_width * 0.05 ) ){
670- verboseStream << runtime_i << " is on left of histogram but NOT outlier" << std::endl;
671- if (param[func_id].counts ().size () < 1 ) {return 0 ;}
672- if (param[func_id].counts ().at (0 ) == 0 ) { /* *< Ignore zero counts */
673-
674- ad_score = l_threshold - 1 ;
675- verboseStream << " corrected ad_score: " << ad_score << std::endl;
676- }
677- else {
678- ad_score = out_scores_i.at (0 );
679- verboseStream << " ad_score: " << ad_score << std::endl;
680- }
681- }
682- else {
683- verboseStream << runtime_i << " is on left of histogram and an outlier" << std::endl;
684- ad_score = max_score;
685- verboseStream << " ad_score(max_score): " << ad_score << std::endl;
686- }
687-
688- }
689- /* *
690- * If the sample does not belong to any bins
691- */
692- else if (bin_ind == num_bins + 1 ){
693- const int last_idx = param[func_id].bin_edges ().size () - 1 ;
694- const double last_bin_edge = param[func_id].bin_edges ().at (last_idx);
695- const double dist = runtime_i - last_bin_edge;
696- verboseStream << " last_indx: " << last_idx << " , last_bin_edge: " << last_bin_edge << std::endl;
697- if (dist <= (bin_width * 0.05 )) {
698- if (param[func_id].counts ().at (num_bins - 1 ) == 0 ) { // bin_ind) == 0) { /**< Ignore zero counts */
699-
700- ad_score = l_threshold - 1 ;
701- verboseStream << " corrected ad_score: " << ad_score << std::endl;
702- }
703- else {
704- verboseStream << runtime_i << " is on right of histogram but NOT outlier" << std::endl;
705- ad_score = out_scores_i.at (num_bins - 1 );
706- verboseStream << " ad_score: " << ad_score << " , num_bins: " << num_bins << " , out_scores_i size: " << out_scores_i.size () << std::endl;
707- }
708- }
709- else {
710- verboseStream << runtime_i << " is on right of histogram and an outlier" << std::endl;
711- ad_score = max_score;
712- verboseStream << " ad_score(max_score): " << ad_score << " , num_bins: " << num_bins << " , out_scores_i size: " << out_scores_i.size () << std::endl;
713- }
714-
715- }
716- else {
717-
718- if (param[func_id].counts ().at (bin_ind) == 0 ) { /* *< Ignore zero counts */
719-
720- ad_score = l_threshold - 1 ;
721- verboseStream << " corrected ad_score: " << ad_score << std::endl;
722- }
723- else {
724- verboseStream << runtime_i << " maybe be an outlier" << std::endl;
725- ad_score = out_scores_i.at ( bin_ind - 1 );
726- verboseStream << " ad_score(else): " << ad_score << " , bin_ind: " << bin_ind << " , num_bins: " << num_bins << " , out_scores_i size: " << out_scores_i.size () << std::endl;
727- }
728-
729- }
668+
669+ if (mean_pn_ecdf.at (running_idx++) > 0 )
670+ ad_score = l_threshold + 1 ;
671+ else
672+ ad_score = l_threshold - 1 ;
730673
731674 itt->set_outlier_score (ad_score);
732675 verboseStream << " ad_score: " << ad_score << " , l_threshold: " << l_threshold << std::endl;
@@ -740,7 +683,6 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
740683 n_outliers += 1 ;
741684
742685 }
743- // }
744686 else {
745687 // Capture maximum of one normal execution per io step
746688 itt->set_label (1 );
@@ -779,8 +721,15 @@ int ADOutlierCOPOD::np_digitize_get_bin_inds(const double& X, const std::vector<
779721 return ret_val;
780722}
781723
782- auto ADOutlierCOPOD::empiricalCDF (const std::vector<double >& runtimes, const bool sorted) {
724+ std::vector<double > ADOutlierCOPOD::empiricalCDF (const std::vector<double >& runtimes, const bool sorted) {
725+
726+ std::vector<double > tmp_runtimes = runtimes;
727+ auto ecdf = boost::math::empirical_cumulative_distribution_function (std::move (tmp_runtimes));
728+ std::vector<double > result_ecdf = std::vector<double >(runtimes.size (), 0.0 );
729+ for (int i=0 ; i < runtimes.size (); i++) {
730+ result_ecdf.at (i) = ecdf (runtimes.at (i));
731+ }
783732
784- return boost::math::empirical_cumulative_distribution_function ( std::move (runtimes), sorted) ;
733+ return result_ecdf ;
785734
786735}
0 commit comments