Skip to content

Commit dfb7101

Browse files
committed
Merge branch 'ckelly_develop' into provdb_multiprovider
2 parents 75a6381 + 87fe054 commit dfb7101

11 files changed

Lines changed: 176 additions & 59 deletions

File tree

include/chimbuko/ad/ADOutlier.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ namespace chimbuko {
238238

239239
private:
240240
double m_alpha; /**< Used to prevent log2 overflow */
241-
double m_threshold; /**< Threshold used to filter anomalies in HBOS*/
241+
double m_threshold; /**< Threshold used to filter anomalies in COPOD*/
242242
bool m_use_global_threshold; /**< Flag to use global threshold*/
243243
//double m_threshold; /** sync with global threshold */
244244
OutlierStatistic m_statistic; /** Which statistic to use for outlier detection */
@@ -253,13 +253,13 @@ namespace chimbuko {
253253
public:
254254

255255
/**
256-
* @brief Construct a new ADOutlierHBOS object
256+
* @brief Construct a new ADOutlierCOPOD object
257257
*
258258
*/
259259
ADOutlierCOPOD(OutlierStatistic stat = ExclusiveRuntime, double threshold = 0.99, bool use_global_threshold = true);
260260

261261
/**
262-
* @brief Destroy the ADOutlierHBOS object
262+
* @brief Destroy the ADOutlierCOPOD object
263263
*
264264
*/
265265
~ADOutlierCOPOD();
@@ -308,11 +308,12 @@ namespace chimbuko {
308308

309309
private:
310310
double m_alpha; /**< Used to prevent log2 overflow */
311-
double m_threshold; /**< Threshold used to filter anomalies in HBOS*/
311+
double m_threshold; /**< Threshold used to filter anomalies in COPOD*/
312312
bool m_use_global_threshold; /**< Flag to use global threshold*/
313-
//double m_threshold; /** sync with global threshold */
314-
OutlierStatistic m_statistic; /** Which statistic to use for outlier detection */
313+
//double m_threshold; /**< sync with global threshold */
314+
OutlierStatistic m_statistic; /**< Which statistic to use for outlier detection */
315315

316+
std::unordered_map<unsigned long, int> m_skewness; /**< skewness for each function*/
316317
};
317318

318319

include/chimbuko/ad/ExecData.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ class ExecData_t {
489489
/**
490490
* @brief Return the outlier severity, representing how important the outlier is
491491
*/
492-
double get_outlier_severity() const{ return (double)get_runtime(); }
492+
double get_outlier_severity() const{ return (double)get_exclusive(); }
493493

494494
/**
495495
* @brief Set the label

src/ad/ADOutlier.cpp

Lines changed: 104 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -266,16 +266,26 @@ Anomalies ADOutlierHBOS::run(int step) {
266266
}
267267
if (runtimes.size() > 0) {
268268
if (!g.find(func_id)) { // If func_id does not exist
269-
const int r = param[func_id].create_histogram(runtimes);
270-
if (r < 0) {return outliers;}
269+
270+
const int r = param[func_id].create_histogram(runtimes);
271+
if (r < 0) {
272+
recoverable_error(std::string("AD: Func_ID does not exist"));
273+
continue;
274+
}
271275
}
272276
else { //merge with exisiting func_id, not overwrite
273277

274278
const int r = param[func_id].merge_histograms(g[func_id], runtimes);
275-
if (r < 0) {return outliers;}
279+
if (r < 0) {
280+
recoverable_error(std::string("AD: Merging error received "));
281+
continue;
282+
}
276283
}
277284
}
278-
else { return outliers;}
285+
else {
286+
//recoverable_error(std::string("AD: Zero function runtimes "));
287+
continue;
288+
}
279289
}
280290

281291
//Update temp runstats to include information collected previously (synchronizes with the parameter server if connected)
@@ -543,15 +553,42 @@ Anomalies ADOutlierCOPOD::run(int step) {
543553
if (runtimes.size() > 0) {
544554
if (!g.find(func_id)) { // If func_id does not exist
545555
const int r = param[func_id].create_histogram(runtimes);
546-
if (r < 0) {return outliers;}
556+
if (r < 0) {
557+
recoverable_error(std::string("AD: Func_ID does not exist "));
558+
continue;
559+
}
547560
}
548561
else { //merge with exisiting func_id, not overwrite
549562

550563
const int r = param[func_id].merge_histograms(g[func_id], runtimes);
551-
if (r < 0) {return outliers;}
564+
if (r < 0) {
565+
recoverable_error(std::string("AD: Merging error received "));
566+
continue;
567+
}
552568
}
553569
}
554-
else { return outliers;}
570+
else {
571+
///recoverable_error(std::string("AD: Zero function runtimes "));
572+
continue;
573+
}
574+
verboseStream << "Size of runtimes: " << runtimes.size() << ", func_id: " << func_id << std::endl;
575+
576+
//calculate skewness of runtimes for func_id
577+
const double mu = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
578+
579+
std::vector<double> diff = std::vector<double>(runtimes.size());
580+
std::transform(runtimes.begin(), runtimes.end(), diff.begin(), [mu](double x) {return x - mu;});
581+
const double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
582+
const double stdev = std::sqrt(sq_sum / runtimes.size());
583+
584+
double summ = 0;
585+
for (int i=0; i<runtimes.size(); i++) {
586+
summ += std::pow((runtimes.at(i) - mu), 3);
587+
}
588+
589+
const double abs_skewness = summ / ((runtimes.size() - 1) * std::pow(stdev,3));
590+
m_skewness[func_id] = (abs_skewness < 0) ? -1 : (abs_skewness > 0) ? 1 : 0;
591+
555592
}
556593

557594
//Update temp runstats to include information collected previously (synchronizes with the parameter server if connected)
@@ -579,6 +616,7 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
579616
std::vector<CallListIterator_t>& data){
580617

581618
verboseStream << "Finding outliers in events for func " << func_id << std::endl;
619+
verboseStream << "data Size: " << data.size() << std::endl;
582620

583621
CopodParam& param = *(CopodParam*)m_param;
584622

@@ -589,47 +627,72 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
589627
//std::vector<double> prob_counts = std::vector<double>(param[func_id].counts().size(), 0.0);
590628
double tot_runtimes = std::accumulate(param[func_id].counts().begin(), param[func_id].counts().end(), 0.0);
591629

630+
if (tot_runtimes <= 0 ) {
631+
return n_outliers;
632+
}
592633
std::vector<double> recon_p_runtimes = std::vector<double>(tot_runtimes, 0.0);
593634
std::vector<double> recon_n_runtimes = std::vector<double>(tot_runtimes, 0.0);
594635
int recon_idx = 0;
595-
//verboseStream << "Unwrapping Merged Histogram. Size: " << param[func_id].counts().size() << std::endl;
636+
verboseStream << "Unwrapping Merged Histogram. Size: " << param[func_id].counts().size() << std::endl;
596637
for(int i=0; i < param[func_id].counts().size(); i++){
597638
int count = param[func_id].counts().at(i);
598-
//verboseStream << "Count: " << count << ", Value: " << param[func_id].bin_edges().at(i) << std::endl;
639+
verboseStream << "Count: " << count << ", Value: " << param[func_id].bin_edges().at(i) << std::endl;
599640
for(int j=0; j<count; j++){
600641

601642
recon_p_runtimes.at(recon_idx) = param[func_id].bin_edges().at(i);
602643
recon_n_runtimes.at(recon_idx) = -1 * param[func_id].bin_edges().at(i);
603-
//verboseStream << "recon_p_runtimes.at(recon_idx): " << recon_p_runtimes.at(recon_idx) << ", recon_n_runtimes.at(recon_idx): " << recon_n_runtimes.at(recon_idx) << std::endl;
604-
//verboseStream << "recon_idx: " << recon_idx << std::endl;
644+
verboseStream << "recon_idx: " << recon_idx << std::endl;
645+
verboseStream << "recon_p_runtimes.at(recon_idx): " << recon_p_runtimes.at(recon_idx) << ", recon_n_runtimes.at(recon_idx): " << recon_n_runtimes.at(recon_idx) << std::endl;
605646
recon_idx++;
606647
}
607648
}
608649

650+
609651
std::vector<double> func_p_ecdf = empiricalCDF(recon_p_runtimes, true);
610652
std::vector<double> func_n_ecdf = empiricalCDF(recon_n_runtimes, true);
653+
654+
verboseStream << "Size of empiricalCDF(recon_p_runtimes): " << func_p_ecdf.size() << std::endl;
655+
verboseStream << "Size of empiricalCDF(recon_n_runtimes): " << func_n_ecdf.size() << std::endl;
611656

612657
std::vector<double> mean_pn_ecdf = std::vector<double>(func_p_ecdf.size(), 0.0);
658+
verboseStream << "Size of mean_pn_ecdf: " << mean_pn_ecdf.size() << ", func_id: " << func_id << std::endl;
659+
613660
for(int i=0; i < mean_pn_ecdf.size(); i++){
614-
mean_pn_ecdf.at(i) = (func_p_ecdf.at(i) + func_n_ecdf.at(i)) / 2;
661+
mean_pn_ecdf.at(i) = (func_p_ecdf.at(i) + func_n_ecdf.at(i)) / 2.0;
662+
verboseStream << "mean_pn_ecdf.at(i): " << mean_pn_ecdf.at(i) << ", func_p_ecdf.at(i): " << func_p_ecdf.at(i) << ", func_n_ecdf.at(i): " << func_n_ecdf.at(i) << std::endl;
615663
}
616664

665+
//use skewness
666+
std::vector<double> skewness_arr = std::vector<double>(func_p_ecdf.size(), 0.0);
667+
const int p_sign = (m_skewness[func_id] - 1) < 0 ? -1 : (m_skewness[func_id] - 1) > 0 ? 1 : 0;
668+
const int n_sign = (m_skewness[func_id] + 1) < 0 ? -1 : (m_skewness[func_id] + 1) > 0 ? 1 : 0;
669+
670+
for (int i = 0; i< func_p_ecdf.size(); i++) {
671+
skewness_arr.at(i) = (func_p_ecdf.at(i) * -1 * p_sign) + (func_n_ecdf.at(i) * n_sign);
672+
verboseStream << "skewness_arr.at(" << i << "): " << skewness_arr.at(i) << std::endl;
673+
}
617674

618-
//for(int i=0; i < param[func_id].counts().size(); i++){
619-
// int count = param[func_id].counts().at(i);
620-
// double p = count / tot_runtimes;
621-
// prob_counts.at(i) += p;
622-
//}
675+
std::vector<double> final_comp = std::vector<double>(skewness_arr.size(), 0.0);
676+
for (int i = 0; i < skewness_arr.size(); i++) {
677+
final_comp.at(i) = std::max(skewness_arr.at(i), mean_pn_ecdf.at(i));
678+
}
679+
680+
681+
//for(int i=0; i<mean_pn_ecdf.size(); i++)
682+
// verboseStream << "mean_pn_ecdf at " << i << ": " << mean_pn_ecdf.at(i) << std::endl;
623683

624684
//Create COPOD score vector
625-
std::vector<double> out_scores_i;
685+
std::vector<double> out_scores_i = std::vector<double>(final_comp.size(), 0.0);
686+
verboseStream << "m_alpha: " << m_alpha << std::endl;
687+
626688
double min_score = -1 * log2(0.0 + m_alpha);
627-
double max_score = -1 * log2(1.0 + m_alpha);
689+
double max_score = log2(1.0 + m_alpha) - min_score;
690+
verboseStream << "Initializaing min_score: " << min_score << ", max_score: " << max_score <<std::endl;
628691
verboseStream << "out_scores_i: " << std::endl;
629-
for(int i=0; i < mean_pn_ecdf.size(); i++){
630-
double l = -1 * log2(mean_pn_ecdf.at(i) + m_alpha);
631-
out_scores_i.push_back(l);
632-
//verboseStream << "Count: " << param[func_id].counts().at(i) << ", Probability: " << prob_counts.at(i) << ", score: "<< l << std::endl;
692+
for(int i=0; i < final_comp.size(); i++){
693+
double l = -1 * log2(final_comp.at(i) + m_alpha);
694+
out_scores_i.at(i) = l;
695+
verboseStream << "Final_comp at " << i << ": " << final_comp.at(i) << ", score: "<< l << std::endl;
633696
//if(prob_counts.at(i) > 0) {
634697
if(l < min_score){
635698
min_score = l;
@@ -648,9 +711,10 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
648711

649712
//compute threshold
650713
verboseStream << "Global threshold before comparison with local threshold = " << param[func_id].get_threshold() << std::endl;
651-
double l_threshold = min_score + (m_threshold * (max_score - min_score));
714+
double l_threshold = (max_score < 0) ? (-1 * m_threshold * (max_score - min_score)) : min_score + (m_threshold * (max_score - min_score));
715+
verboseStream << "l_threshold computed: " << l_threshold << std::endl;
652716
if(m_use_global_threshold) {
653-
if(l_threshold < param[func_id].get_threshold()) {
717+
if(l_threshold < param[func_id].get_threshold() && param[func_id].get_threshold() > (-1 * log2(1.00001))) {
654718
l_threshold = param[func_id].get_threshold();
655719
} else {
656720
param[func_id].set_glob_threshold(l_threshold); //.get_histogram().glob_threshold = l_threshold;
@@ -659,9 +723,6 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
659723
}
660724

661725
//Compute COPOD based score for each datapoint
662-
//const double bin_width = param[func_id].bin_edges().at(1) - param[func_id].bin_edges().at(0);
663-
//const int num_bins = param[func_id].counts().size();
664-
//verboseStream << "Bin width: " << bin_width << std::endl;
665726

666727
int top_out = 0;
667728
int running_idx = 0;
@@ -670,12 +731,21 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
670731

671732
const double runtime_i = this->getStatisticValue(*itt); //runtimes.push_back(this->getStatisticValue(*itt));
672733
double ad_score;
673-
674-
//verboseStream << "mean_pn_ecdf.at(running_idx++): " << mean_pn_ecdf.at(running_idx) << std::endl;
675-
if (mean_pn_ecdf.at(running_idx++) < 0.99)
676-
ad_score = l_threshold + 1;
677-
else
678-
ad_score = l_threshold - 1;
734+
735+
if (running_idx < final_comp.size()) {
736+
verboseStream << "final_comp.at(" << running_idx << "): " << final_comp.at(running_idx) << ", func_id: " << func_id << ", runtime: " << runtime_i << std::endl;
737+
738+
if (out_scores_i.at(running_idx) > l_threshold) //(final_comp.at(running_idx) > 0) // < 0.9)
739+
ad_score = l_threshold + 1;
740+
else
741+
ad_score = l_threshold - 1;
742+
743+
running_idx++;
744+
}
745+
else {
746+
recoverable_error("AD: COPOD: runtime Index");
747+
continue;
748+
}
679749

680750
itt->set_outlier_score(ad_score);
681751
verboseStream << "ad_score: " << ad_score << ", l_threshold: " << l_threshold << std::endl;

src/pserver/AggregateFuncStats.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ using namespace chimbuko;
44

55
AggregateFuncStats::AggregateFuncStats(int pid, int fid, const std::string &func): m_pid(pid), m_fid(fid), m_func(func){
66
m_func_anomaly.set_do_accumulate(true); //accumulate number of anomalies as 'accumulate' field
7+
m_inclusive.set_do_accumulate(true); //also accumulate total runtimes over all rank, thread
8+
m_exclusive.set_do_accumulate(true);
79
}
810

911
void AggregateFuncStats::add(unsigned long n_anomaly, const RunStats& inclusive, const RunStats& exclusive){

src/util/RunStats.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,11 @@ RunStats chimbuko::operator+(const RunStats &a, const RunStats &b)
199199
bool do_accumulate = false;
200200
if (a.m_do_accumulate || b.m_do_accumulate)
201201
{
202-
sum_acc = a.m_state.acc + b.m_state.acc;
203-
do_accumulate = true;
202+
double a_acc = a.m_do_accumulate ? a.m_state.acc : a.m_state.eta * a.m_state.count; //reconstruct sum
203+
double b_acc = b.m_do_accumulate ? b.m_state.acc : b.m_state.eta * b.m_state.count;
204+
sum_acc = a_acc + b_acc;
205+
//sum_acc = a.m_state.acc + b.m_state.acc;
206+
do_accumulate = true;
204207
}
205208

206209
RunStats combined(do_accumulate);

test/test_param.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ TEST(GlobalAnomalyStatsTest, AnomalyStatTest2)
289289
EXPECT_EQ(0, param.collect_func_data().size());
290290

291291
std::unordered_map<unsigned long, RunStats> g_inclusive, g_exclusive;
292-
RunStats l_inclusive, l_exclusive;
292+
RunStats l_inclusive(true), l_exclusive(true); //accumulate total runtimes
293293

294294
l_inclusive.clear();
295295
l_inclusive.push(10); l_inclusive.push(10.5); l_inclusive.push(9.5);

test/unit_tests/ad/COPODOutlierADs.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class ADOutlierCOPODTest: public ADOutlierCOPOD{
2828
};
2929

3030

31-
TEST(HBOSADOutlierTestSyncParamWithPSComputeOutliers, Works){
31+
TEST(COPODADOutlierTestSyncParamWithPSComputeOutliers, Works){
3232
CopodParam global_params_ps; //parameters held in the parameter server
3333
CopodParam local_params_ad, local_params_ad2; //parameters collected by AD
3434

@@ -49,7 +49,7 @@ TEST(HBOSADOutlierTestSyncParamWithPSComputeOutliers, Works){
4949
std::vector<double> runtimes;
5050
Histogram &r = local_params_ad[0];
5151
for(int i=0;i<N;i++) {
52-
double val = i==N-1 ? 1000 : double(dist(gen));
52+
double val = i==N-1 ? 10000 : double(dist(gen));
5353
call_list.push_back( createFuncExecData_t(0,0,0, 0, "my_func", 1000*(i+1), val) );
5454
runtimes.push_back(val);
5555
std::cout << "vals in localhist 1: " << val << std::endl;
@@ -65,7 +65,7 @@ TEST(HBOSADOutlierTestSyncParamWithPSComputeOutliers, Works){
6565
std::vector<double> runtimes;
6666
Histogram &r = local_params_ad2[0];
6767
for(int i=0;i<N;i++) {
68-
double val = i==N-1 ? 1000 : double(dist(gen));
68+
double val = i==N-1 ? 20000 : double(dist(gen));
6969
call_list2.push_back( createFuncExecData_t(0,0,0, 0, "my_func", 1000*(i+1), val) );
7070
runtimes.push_back(val);
7171
std::cout << "vals in localhist 2: " << val << std::endl;

test/unit_tests/ad/HBOSOutlierTestBPFile.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ TEST(HBOSADOutlierBPFileWithoutPServer, Works) {
115115
//Parameters for the connection to the instrumented binary trace output
116116
params.trace_engineType = "BPFile"; // BPFile or SST
117117
params.trace_data_dir = trace_data_dir; // *.bp location
118-
std::string bp_prefix = "tau-metrics"; // bp file prefix (e.g. tau-metrics-[nwchem])
118+
std::string bp_prefix = "tau-metrics"; //"tau-metrics-xgc-es-cpp-gpu"; // bp file prefix (e.g. tau-metrics-[nwchem])
119119

120120
//The remainder are optional arguments. Enable using the appropriate command line switch
121121
params.program_idx = 0;

test/unit_tests/ad/SSTDOutlierTestBPFile.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ TEST(SSTDADOutlierBPFileWithoutPServer, Works) {
9797
if(_argc < 2){
9898
throw std::runtime_error("Path to trace data directory must be provided as an argument!");
9999
}
100-
std::string trace_data_dir = _argv[1];
100+
std::string trace_data_dir = _argv[1]; //"/home/src/chimbuko_ad/96rank_98step_traces_11_8_21/"; //_argv[1];
101101

102102
//int file_suffix = 1;
103103
int ranks = 4;
@@ -116,7 +116,7 @@ TEST(SSTDADOutlierBPFileWithoutPServer, Works) {
116116
//Parameters for the connection to the instrumented binary trace output
117117
params.trace_engineType = "BPFile"; // argv[1]; // BPFile or SST
118118
params.trace_data_dir = trace_data_dir; // *.bp location
119-
std::string bp_prefix = "tau-metrics"; //argv[3]; // bp file prefix (e.g. tau-metrics-[nwchem])
119+
std::string bp_prefix = "tau-metrics"; //"tau-metrics-xgc-es-cpp-gpu"; //argv[3]; // bp file prefix (e.g. tau-metrics-[nwchem])
120120

121121
//The remainder are optional arguments. Enable using the appropriate command line switch
122122
params.program_idx = 0;

0 commit comments

Comments
 (0)