Skip to content

Commit 6493c84

Browse files
committed
Added comments and error checks to simulator
1 parent a2a392e commit 6493c84

2 files changed

Lines changed: 61 additions & 6 deletions

File tree

sim/include/sim/ad.hpp

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ namespace chimbuko_sim{
1818
std::list<CallListIterator_t> m_step_exec_its; //iterators to events on this io step; flushed at end of step
1919
std::unordered_map<unsigned long, std::vector<CallListIterator_t>> m_step_func_exec_its; //map of function id to iterators on this io step; flushed at end of step
2020

21+
bool m_step_is_open; /**< Is an io step currently open? */
2122
int m_step;
2223
int m_window_size;
2324
int m_pid;
@@ -30,46 +31,91 @@ namespace chimbuko_sim{
3031
public:
3132
void init(int window_size, int pid, int rid);
3233

34+
/**
35+
* @brief Instantiate the AD simulator
36+
* @param window_size The number of events around an anomaly that are recorded in the provDB
37+
* @param pid The program index
38+
* @param rid The rank index
39+
*/
3340
ADsim(int window_size, int pid, int rid): ADsim(){
3441
init(window_size, pid, rid);
3542
}
3643
ADsim(){}
3744

3845
ADProvenanceDBclient &getProvDBclient(){ return *m_pdb_client; }
3946

40-
//Add a function execution on a specific thread
47+
/**
48+
* @brief Add a function execution on a specific thread
49+
* @param thread The thread index
50+
* @param func_name The name of the function
51+
* @param start The timestamp of the function start
52+
* @param runtime The function duration
53+
* @param is_anomaly Tag the function as anomalous
54+
* @param outlier_score If anomalous, provide a score
55+
*
56+
* The function end time (start + runtime) must be within the current IO step's time window
57+
*/
4158
CallListIterator_t addExec(const int thread,
4259
const std::string &func_name,
4360
unsigned long start,
4461
unsigned long runtime,
4562
bool is_anomaly,
4663
double outlier_score = 0.);
4764

48-
//Attach a counter to an execution t_delta us after the start of the execution
65+
/**
66+
* @brief Attach a counter to an execution t_delta microseconds (us) after the start of the execution
67+
* @param counter_name The counter name
68+
* @param value The counter value
69+
* @param t_delta The time after the start of the function execution at which the counter occurs
70+
* @param to The function execution
71+
*/
4972
void attachCounter(const std::string &counter_name,
5073
unsigned long value,
5174
long t_delta,
5275
CallListIterator_t to);
5376

54-
//Attach a communication event to an execution t_delta us after the start of the execution
55-
//partner_rank is the origin rank of a receive or the destination rank of a send
77+
/**
78+
* @brief Attach a communication event to an execution t_delta microseconds (us) after the start of the execution
79+
* @param type The type of communication
80+
* @param partner_rank The origin rank of a receive or the destination rank of a send
81+
* @param bytes The number of bytes communicated
82+
* @param t_delta The time after the start of the function execution at which the communication event occurs
83+
* @param to The function execution
84+
*/
5685
void attachComm(CommType type,
5786
unsigned long partner_rank,
5887
unsigned long bytes,
5988
long t_delta,
6089
CallListIterator_t to);
6190

62-
//Register a thread index as corresponding to a GPU thread, allowing population of GPU information in provenance data
91+
/**
92+
* @brief Register a thread index as corresponding to a GPU thread, allowing population of GPU information in provenance data
93+
*/
6394
void registerGPUthread(const int tid);
6495

65-
//Register a GPU kernel event as originating from a cpu event
96+
/**
97+
* @brief Register a GPU kernel event as originating from a cpu event
98+
* @param cpu_parent The parent execution
99+
* @param gpu_kern The child GPU kernel execution
100+
*/
66101
void bindCPUparentGPUkernel(CallListIterator_t cpu_parent, CallListIterator_t gpu_kern);
67102

103+
/**
104+
* @brief Begin an IO step at the provided time
105+
*/
68106
void beginStep(const unsigned long step_start_time);
69107

108+
/**
109+
* @brief End the IO step at the provided time
110+
*
111+
* When this is called, the executions added since the start of the step are gathered and analyzed, and the resulting data is sent to the pserver/provdb
112+
*/
70113
void endStep(const unsigned long step_end_time);
71114
};
72115

116+
/**
117+
* @brief Pretty print a function execution
118+
*/
73119
inline std::ostream & operator<<(std::ostream &os, const CallListIterator_t it){
74120
os << it->get_json(true, true).dump(4);
75121
return os;

sim/src/ad.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include<chimbuko/ad/ADLocalCounterStatistics.hpp>
44
#include<chimbuko/ad/ADNormalEventProvenance.hpp>
55
#include<chimbuko/ad/ADAnomalyProvenance.hpp>
6+
#include<chimbuko/util/error.hpp>
67

78
#include<sim/ad.hpp>
89
#include<sim/provdb.hpp>
@@ -23,6 +24,7 @@ void ADsim::init(int window_size, int pid, int rid){
2324
m_pdb_client->setEnableHandshake(false);
2425
m_pdb_client->connect(getProvDB().getAddr(), getProvDB().getNshards());
2526
m_counters.linkCounterMap(getCidxManager().getCounterMap());
27+
m_step_is_open = false;
2628
}
2729

2830
CallListIterator_t ADsim::addExec(const int thread,
@@ -31,6 +33,9 @@ CallListIterator_t ADsim::addExec(const int thread,
3133
unsigned long runtime,
3234
bool is_anomaly,
3335
double outlier_score){
36+
if(!m_step_is_open) fatal_error("No step is currently open");
37+
if(start + runtime < m_step_start_time) fatal_error("Function end time is not within the current step");
38+
3439
auto it = funcIdxMap().find(func_name);
3540
if(it == funcIdxMap().end()) assert(0);
3641
unsigned long func_id = it->second;
@@ -107,6 +112,7 @@ void ADsim::bindCPUparentGPUkernel(CallListIterator_t cpu_parent, CallListIterat
107112

108113
void ADsim::beginStep(const unsigned long step_start_time){
109114
m_step_start_time = step_start_time;
115+
m_step_is_open = true;
110116
}
111117

112118
void ADsim::endStep(const unsigned long step_end_time){
@@ -115,6 +121,8 @@ void ADsim::endStep(const unsigned long step_end_time){
115121

116122
int nanom=0, nnorm=0;
117123
for(const auto &exec_it : m_step_exec_its){
124+
if(exec_it->get_exit() > step_end_time) fatal_error("Event " + exec_it->get_json(true, true).dump(4) + " has an exit time later than the end of the current step, " + std::to_string(step_end_time) );
125+
118126
if(exec_it->get_label() == -1){ anom.insert(exec_it, Anomalies::EventType::Outlier); nanom++; }
119127
else if(anom.nFuncEvents(exec_it->get_fid(), Anomalies::EventType::Normal) == 0){ anom.insert(exec_it, Anomalies::EventType::Normal); nnorm++; }
120128
}
@@ -157,4 +165,5 @@ void ADsim::endStep(const unsigned long step_end_time){
157165
delete m_counters.flushCounters();
158166

159167
m_step++;
168+
m_step_is_open = false;
160169
}

0 commit comments

Comments
 (0)