Skip to content

Commit a6ae0ac

Browse files
committed
SSTD algorithm score is now the relative number of standard deviations from mean rather than a p-value, as all non-trivial outliers have a p-value that is 0 to many sig figs!
Added a "severity" score to the outliers in the provenance database (currently the anomaly runtime) that allows for sorting anomalies by impact on application Added function to simulated AD to update the runtime on an existing exec, allowing for easier trace generation for nested functions Like the real pserver, the simulated pserver now only outputs stats data in each period if any data has been collected. Output data is now "pretty printed". Added a new example simulator main program with manually tagged anomalies for a multi-thread, multi-rank program Added a perl script to locate unmatched correlation IDs in a sst_view dump on an ADIOS2 binary
1 parent ebf3918 commit a6ae0ac

9 files changed

Lines changed: 230 additions & 6 deletions

File tree

scripts/find_unmatched_corids.pl

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/perl
2+
3+
$ARGC=scalar @ARGV;
4+
if($ARGC != 1){
5+
print "Need dump of ADIOS2 binary";
6+
exit(-1);
7+
}
8+
9+
10+
11+
open(IN,$ARGV[0]);
12+
$idx=-1;
13+
$active=0;
14+
$ndata=-1;
15+
$i=0;
16+
17+
%corid;
18+
19+
while(<IN>){
20+
$line=$_;
21+
if($line=~m/counter\s(\d+)\s.*Correlation ID/){
22+
$idx=$1;
23+
print "Found counter index $idx\n";
24+
}elsif($line=~m/counter_values.*Parsed size (\d+)/){
25+
$active=1;
26+
$ndata=$1;
27+
$i=0;
28+
print "Found counter data block of size $ndata\n";
29+
}elsif($active == 1){
30+
if($line=~m/3 \):${idx} \(\d+ 4 \):(\d+)/){
31+
if(exists $corid{$1}){
32+
delete $corid{$1};
33+
print "Matched $1\n";
34+
}else{
35+
$corid{$1} = 1;
36+
print "Added $1\n";
37+
}
38+
39+
40+
#print "$i $idx $1\n";
41+
42+
}
43+
#counter_values {AvailableStepsCount:15 Max:1627915315912147 Min:0 Shape:188, 6 SingleValue:false Type:uint64_t } Parsed size 188 6
44+
45+
46+
$i++;
47+
if($i == $ndata){
48+
print "Ending data block on line $i\n";
49+
$active=0;
50+
}
51+
}
52+
53+
#counter 0 {Elements:1 Type:string Value:"Correlation ID" }
54+
55+
#/bld/benchmark_suite/cupti_gpu_kernel_outlier/chimbuko/adios2/data
56+
}
57+
close(IN);
58+
59+
$nunmatched=scalar keys %corid;
60+
print "--------$nunmatched unmatched corids\n";
61+
62+
foreach $id (keys %corid){
63+
print "--------Unmatched $id\n";
64+
}

sim/include/sim/ad.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,15 @@ namespace chimbuko_sim{
9090
bool is_anomaly = false,
9191
double outlier_score = 0.);
9292

93+
/**
94+
* @brief Modify the runtime of an existing execution
95+
* @param exec_it The iterator to the execution (output of previous addExec)
96+
* @param new_runtime The new function runtime
97+
*/
98+
void updateRuntime(CallListIterator_t exec_it,
99+
unsigned long new_runtime);
100+
101+
93102
/**
94103
* @brief Attach a counter to an execution t_delta us after the start of the execution
95104
* @param counter_name The counter name

sim/main/Makefile.am

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@ AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/3rdparty -I$(top_srcdir)/s
22
LDADD = ../src/libchimbuko_sim.la ../../src/libchimbuko.la -lstdc++fs
33

44
simmaindir = $(prefix)/sim/main
5-
simmain_PROGRAMS = example1 example2
5+
simmain_PROGRAMS = example1 example2 example3
66

77
example1_SOURCES = example1.cpp
88
example1_LDADD = $(LDADD)
99

1010
example2_SOURCES = example2.cpp
1111
example2_LDADD = $(LDADD)
12+
13+
example3_SOURCES = example3.cpp
14+
example3_LDADD = $(LDADD)

sim/main/example3.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
//This example demonstrates the simulator using the ADSim API directly with multiple ranks and multiple threads per rank
2+
//Anomalies are explicitly tagged
3+
#include<mpi.h>
4+
#include<sim.hpp>
5+
#include<random>
6+
7+
using namespace chimbuko_sim;
8+
9+
int main(int argc, char **argv){
10+
MPI_Init(&argc, &argv);
11+
12+
int window_size = 5; //number of events to record around an anomaly in the provenance data
13+
int pid = 0; //program index
14+
unsigned long program_start = 100;
15+
unsigned long step_freq = 1000;
16+
17+
//Setup the "AD" instances
18+
int n_ranks = 4;
19+
int threads_per_rank = 2;
20+
21+
std::vector<ADsim> ad;
22+
for(int r=0;r<n_ranks;r++)
23+
ad.push_back(ADsim(window_size, pid, r, program_start, step_freq));
24+
25+
//Setup some functions
26+
registerFunc("main", 500, 50, 100);
27+
registerFunc("child", 250, 25, 100);
28+
29+
int anom_runtimes[3] = {500,750,1000};
30+
double anom_scores[3] = {2,3,4};
31+
32+
//Setup some traces
33+
int nexec = 500;
34+
int anom_freq = 50;
35+
36+
for(int rank=0;rank<n_ranks;rank++){
37+
for(int thread=0;thread<threads_per_rank;thread++){
38+
int anom_off = 0;
39+
40+
unsigned long child_start = 200;
41+
42+
unsigned long time = program_start;
43+
auto main = ad[rank].addExec(thread, "main", time, 0); //will update the end time later
44+
for(int i=0;i<nexec;i++){
45+
CallListIterator_t child;
46+
unsigned long runtime;
47+
if(i!=0 && i % anom_freq == 0){ //an anomaly!
48+
runtime = anom_runtimes[anom_off];
49+
double score = anom_scores[anom_off];
50+
child = ad[rank].addExec(thread, "child", time, runtime, true, score);
51+
52+
std::cout << rank << " " << thread << " " << i << " anomaly " << runtime << std::endl;
53+
54+
anom_off = (anom_off + 1) % 3; //cycle through 3 times
55+
}else{
56+
runtime = 250;
57+
child = ad[rank].addExec(thread, "child", time, runtime);
58+
59+
std::cout << rank << " " << thread << " " << i << " normal " << runtime << std::endl;
60+
}
61+
ad[rank].bindParentChild(main, child);
62+
63+
time = time + runtime + 1;
64+
}
65+
ad[rank].updateRuntime(main, time - program_start);
66+
}
67+
}
68+
69+
//Run the simulation
70+
int nstep = ad[0].largest_step() + 1;
71+
for(int r=1;r<n_ranks;r++)
72+
nstep = std::max( (unsigned long)nstep, ad[r].largest_step() + 1);
73+
74+
std::cout << "Got data for " << nstep << " steps" << std::endl;
75+
76+
for(int i=0;i<nstep;i++){
77+
for(int r=0;r<n_ranks;r++){
78+
std::cout << "Rank " << r << " step " << i << " contains " << ad[r].nStepExecs(i) << " execs" << std::endl;
79+
ad[r].step(i);
80+
}
81+
//PServer dump its output
82+
getPserver().writeStreamingOutput();
83+
}
84+
85+
MPI_Finalize();
86+
std::cout << "Done" << std::endl;
87+
return 0;
88+
}

sim/src/ad.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,56 @@ CallListIterator_t ADsim::addExec(const int thread,
104104
}
105105

106106

107+
void ADsim::updateRuntime(CallListIterator_t exec_it,
108+
unsigned long new_runtime){
109+
110+
unsigned long old_exit = exec_it->get_exit();
111+
unsigned long old_exit_step = ( old_exit - m_program_start ) / m_step_freq;
112+
113+
//Remove the exec from the old step
114+
bool found = false;
115+
auto &old_step_list = m_step_execs[old_exit_step];
116+
for(auto old_step_it = old_step_list.begin(); old_step_it != old_step_list.end(); old_step_it++){
117+
if( *old_step_it == exec_it ){
118+
old_step_list.erase(old_step_it);
119+
found = true;
120+
break;
121+
}
122+
}
123+
if(!found) fatal_error("Could not find iterator in old step list!");
124+
125+
unsigned long new_exit = exec_it->get_entry() + new_runtime;
126+
unsigned long new_exit_step = ( new_exit - m_program_start ) / m_step_freq;
127+
128+
exec_it->update_exclusive(old_exit); //hacky way to reset exit time
129+
exec_it->update_exit(new_exit);
130+
131+
m_largest_step = std::max(m_largest_step, new_exit_step);
132+
133+
//Determine where to insert the entry
134+
auto &new_step_list = m_step_execs[new_exit_step];
135+
bool inserted = false;
136+
137+
if(new_step_list.size() > 1){ //if >2 events the execution might be between existing events
138+
unsigned long prev_exit = (*new_step_list.begin())->get_exit();
139+
for(auto new_step_it = std::next(new_step_list.begin(),1); new_step_it != new_step_list.end(); new_step_it++){
140+
unsigned long cur_exit = (*new_step_it)->get_exit();
141+
if(new_exit >= prev_exit && new_exit <= cur_exit){ //some ambiguity for 0 runtime functions here
142+
new_step_list.insert(new_step_it, exec_it);
143+
inserted = true;
144+
break;
145+
}
146+
prev_exit = cur_exit;
147+
}
148+
}
149+
150+
if(!inserted)
151+
new_step_list.push_back(exec_it);
152+
}
153+
154+
155+
156+
107157
void ADsim::attachCounter(const std::string &counter_name,
108158
unsigned long value,
109159
long t_delta,

sim/src/ad_params.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ using namespace chimbuko_sim;
77

88
void chimbuko_sim::registerFunc(const std::string &func_name, unsigned long mean_runtime, unsigned long std_dev_runtime, int seen_count){
99
if(adAlgorithmParams().algorithm != "none") fatal_error("This function should only be used if ad_algorithm()==\"none\"");
10+
if(seen_count == 0) fatal_error("seen_count must be >0 to generate the fake function statistics");
1011
unsigned long fidx = registerFunc(func_name);
1112

1213
//Generate fake statistics

sim/src/pserver.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ void pserverSim::writeStreamingOutput() const{
1414
std::ostringstream fname; fname << "pserver_output_stats_" << iter << ".json";
1515
++iter;
1616

17-
std::ofstream out(fname.str()); out << json_packet.dump();
17+
if(json_packet.size() != 0){ //only write if there is anything to write! (it writes "null" to the file otherwise)
18+
std::ofstream out(fname.str()); out << json_packet.dump(4);
19+
}
1820
}
1921

2022
pserverSim::pserverSim(): anomaly_stats_payload(&global_func_stats), counter_stats_payload(&global_counter_stats), m_ad_params(nullptr){

src/ad/ADAnomalyProvenance.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,8 @@ nlohmann::json ADAnomalyProvenance::get_json() const{
179179
{"io_step_tstart", m_io_step_tstart},
180180
{"io_step_tend", m_io_step_tend},
181181
{"outlier_score", m_call.get_outlier_score() },
182-
{"hostname", getHostname() }
182+
{"outlier_severity", m_call.get_runtime() },
183+
{"hostname", getHostname() }
183184
};
184185
}
185186

src/ad/ADOutlier.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,15 @@ double ADOutlierSSTD::computeScore(CallListIterator_t ev, const SstdParam &stats
148148
double mean = it->second.mean();
149149
double std_dev = it->second.stddev();
150150
if(std_dev == 0.) std_dev = 1e-10; //distribution throws an error if std.dev = 0
151-
boost::math::normal_distribution<double> dist(mean, std_dev);
152-
double cdf_val = boost::math::cdf(dist, runtime); // P( X <= x ) for random variable X
153-
double score = std::min(cdf_val, 1-cdf_val); //two-tailed
151+
152+
//boost::math::normal_distribution<double> dist(mean, std_dev);
153+
//double cdf_val = boost::math::cdf(dist, runtime); // P( X <= x ) for random variable X
154+
//double score = std::min(cdf_val, 1-cdf_val); //two-tailed
155+
156+
//Using the CDF gives scores ~0 for basically any global outlier
157+
//Instead we will use the difference in runtime compared to the avg in units of the standard deviation
158+
double score = fabs( runtime - mean ) / std_dev;
159+
154160
verboseStream << "ADOutlierSSTD::computeScore " << ev->get_funcname() << " runtime " << runtime << " mean " << mean << " std.dev " << std_dev << " -> score " << score << std::endl;
155161
return score;
156162
}

0 commit comments

Comments
 (0)