Skip to content

Commit 0b31583

Browse files
committed
Added classes to gather anomaly metrics on the AD and pserver, to communicate between the two and with the viz + unit tests
1 parent 1c4954f commit 0b31583

20 files changed

Lines changed: 1096 additions & 18 deletions

include/Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
nobase_include_HEADERS = chimbuko/chimbuko.hpp chimbuko/ad/ADOutlier.hpp chimbuko/ad/ADNetClient.hpp chimbuko/ad/FuncStats.hpp chimbuko/ad/ADAnomalyProvenance.hpp chimbuko/ad/ADParser.hpp chimbuko/ad/ADProvenanceDBengine.hpp chimbuko/ad/ADNormalEventProvenance.hpp chimbuko/ad/ADLocalFuncStatistics.hpp chimbuko/ad/ADMetadataParser.hpp chimbuko/ad/AnomalyData.hpp chimbuko/ad/ADDefine.hpp chimbuko/ad/ADglobalFunctionIndexMap.hpp chimbuko/ad/ADProvenanceDBclient.hpp chimbuko/ad/ADEvent.hpp chimbuko/ad/ADCounter.hpp chimbuko/ad/ADLocalCounterStatistics.hpp chimbuko/ad/ExecData.hpp chimbuko/ad/utils.hpp chimbuko/ad/ADio.hpp chimbuko/verbose.hpp chimbuko/pserver/PSProvenanceDBclient.hpp chimbuko/pserver/GlobalAnomalyStats.hpp chimbuko/pserver/PSglobalFunctionIndexMap.hpp chimbuko/pserver/AggregateFuncStats.hpp chimbuko/pserver/AggregateAnomalyData.hpp chimbuko/pserver/PSstatSender.hpp chimbuko/pserver/GlobalCounterStats.hpp chimbuko/param/sstd_param.hpp chimbuko/param/hbos_param.hpp chimbuko/AD.hpp chimbuko/message.hpp chimbuko/net.hpp chimbuko/pserver.hpp chimbuko/param.hpp chimbuko/net/zmqme_net.hpp chimbuko/net/mpi_net.hpp chimbuko/net/zmq_net.hpp chimbuko/net/local_net.hpp chimbuko/util/RunStats.hpp chimbuko/util/ADIOS2parseUtils.hpp chimbuko/util/environment.hpp chimbuko/util/time.hpp chimbuko/util/map.hpp chimbuko/util/error.hpp chimbuko/util/string.hpp chimbuko/util/DispatchQueue.hpp chimbuko/util/PerfStats.hpp chimbuko/util/RunMetric.hpp chimbuko/util/memutils.hpp chimbuko/util/Anomalies.hpp chimbuko/util/barrier.hpp chimbuko/util/threadPool.hpp chimbuko/util/hash.hpp chimbuko/util/mtQueue.hpp chimbuko/util/serialize.hpp chimbuko/util/commandLineParser.hpp
1+
nobase_include_HEADERS = chimbuko/chimbuko.hpp chimbuko/ad/FuncAnomalyMetrics.hpp chimbuko/ad/ADOutlier.hpp chimbuko/ad/ADLocalAnomalyMetrics.hpp chimbuko/ad/ADNetClient.hpp chimbuko/ad/FuncStats.hpp chimbuko/ad/ADAnomalyProvenance.hpp chimbuko/ad/ADParser.hpp chimbuko/ad/ADProvenanceDBengine.hpp chimbuko/ad/ADNormalEventProvenance.hpp chimbuko/ad/ADLocalFuncStatistics.hpp chimbuko/ad/ADMetadataParser.hpp chimbuko/ad/AnomalyData.hpp chimbuko/ad/ADDefine.hpp chimbuko/ad/ADglobalFunctionIndexMap.hpp chimbuko/ad/ADProvenanceDBclient.hpp chimbuko/ad/ADcombinedPSdata.hpp chimbuko/ad/ADEvent.hpp chimbuko/ad/ADCounter.hpp chimbuko/ad/ADLocalCounterStatistics.hpp chimbuko/ad/ExecData.hpp chimbuko/ad/utils.hpp chimbuko/ad/ADio.hpp chimbuko/verbose.hpp chimbuko/pserver/PSProvenanceDBclient.hpp chimbuko/pserver/GlobalAnomalyStats.hpp chimbuko/pserver/AggregateFuncAnomalyMetrics.hpp chimbuko/pserver/PSglobalFunctionIndexMap.hpp chimbuko/pserver/AggregateFuncStats.hpp chimbuko/pserver/AggregateAnomalyData.hpp chimbuko/pserver/NetPayloadRecvCombinedADdata.hpp chimbuko/pserver/PSstatSender.hpp chimbuko/pserver/GlobalAnomalyMetrics.hpp chimbuko/pserver/GlobalCounterStats.hpp chimbuko/param/sstd_param.hpp chimbuko/param/hbos_param.hpp chimbuko/AD.hpp chimbuko/message.hpp chimbuko/net.hpp chimbuko/pserver.hpp chimbuko/param.hpp chimbuko/net/zmqme_net.hpp chimbuko/net/mpi_net.hpp chimbuko/net/zmq_net.hpp chimbuko/net/local_net.hpp chimbuko/util/RunStats.hpp chimbuko/util/ADIOS2parseUtils.hpp chimbuko/util/environment.hpp chimbuko/util/time.hpp chimbuko/util/map.hpp chimbuko/util/error.hpp chimbuko/util/string.hpp chimbuko/util/DispatchQueue.hpp chimbuko/util/PerfStats.hpp chimbuko/util/RunMetric.hpp chimbuko/util/memutils.hpp chimbuko/util/Anomalies.hpp chimbuko/util/barrier.hpp chimbuko/util/threadPool.hpp chimbuko/util/hash.hpp chimbuko/util/mtQueue.hpp chimbuko/util/serialize.hpp chimbuko/util/commandLineParser.hpp
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#pragma once
2+
#include <chimbuko_config.h>
3+
#include <chimbuko/ad/ADNetClient.hpp>
4+
#include <chimbuko/util/Anomalies.hpp>
5+
#include <chimbuko/ad/FuncAnomalyMetrics.hpp>
6+
7+
namespace chimbuko{
8+
9+
/**
10+
* @brief A class that collects anomaly metrics broken down by function for sending to the pserver
11+
*/
12+
class ADLocalAnomalyMetrics{
13+
public:
14+
/**
15+
* @brief Class state that is serialized for pserver comms
16+
*/
17+
struct State{
18+
int app; /**< Program idx*/
19+
int rank; /**< rank */
20+
int step; /**< io step */
21+
unsigned long first_event_ts; /**< Timestamp of first event on step */
22+
unsigned long last_event_ts; /**< Timestamp of last event on step */
23+
24+
std::unordered_map<int, FuncAnomalyMetrics::State> func_anom_metrics;
25+
26+
/*
27+
* @brief Serialize this instance in Cereal
28+
*/
29+
template<class Archive>
30+
void serialize(Archive & archive){
31+
archive(app, rank, step, first_event_ts, last_event_ts, func_anom_metrics);
32+
}
33+
34+
State(const ADLocalAnomalyMetrics &parent);
35+
State(){}
36+
};
37+
38+
/**
39+
* @brief Constructor
40+
* @param app Application index
41+
* @param rank AD rank
42+
* @param step IO step
43+
* @param first_event_ts Timestamp of the first event on this step
44+
* @param last_event_ts Timestamp of the last event on this step
45+
* @param anom Anomalies instance
46+
*/
47+
ADLocalAnomalyMetrics(int app, int rank, int step, unsigned long first_event_ts, unsigned long last_event_ts, const Anomalies &anom);
48+
ADLocalAnomalyMetrics(){}
49+
50+
/**
51+
* @brief Get the current state as a state object
52+
*
53+
* The string dump of this object is the serialized form sent to the parameter server
54+
*/
55+
State get_state() const;
56+
57+
58+
/**
59+
* @brief Set the internal variables to the given state object
60+
*/
61+
void set_state(const State &s);
62+
63+
64+
/**
65+
* @brief Serialize this class for communication over the network
66+
*/
67+
std::string net_serialize() const;
68+
69+
/**
70+
* @brief Unserialize this class after communication over the network
71+
*/
72+
void net_deserialize(const std::string &s);
73+
74+
75+
/**
76+
* @brief Send the data to the pserver
77+
* @param net_client The network client object
78+
* @return std::pair<size_t, size_t> [sent, recv] message size
79+
*/
80+
std::pair<size_t, size_t> send(ADNetClient &client) const;
81+
82+
83+
/**
84+
* @brief Get the data
85+
*/
86+
const std::unordered_map<int, FuncAnomalyMetrics> & get_metrics() const{ return m_func_anom_metrics; }
87+
88+
/**
89+
* @brief Get the program idx
90+
*/
91+
int get_pid() const{ return m_app; }
92+
93+
/**
94+
* @brief Get the rank
95+
*/
96+
int get_rid() const{ return m_rank; }
97+
98+
/**
99+
* @brief Get the IO step
100+
*/
101+
int get_step() const{ return m_step; }
102+
103+
/**
104+
* @brief Get the timestamp of the first event on this IO step
105+
*/
106+
unsigned long get_first_event_ts() const{ return m_first_event_ts; }
107+
108+
/**
109+
* @brief Get the timestamp of the last event on this IO step
110+
*/
111+
unsigned long get_last_event_ts() const{ return m_last_event_ts; }
112+
113+
114+
/**
115+
* @brief Equivalence operator
116+
*/
117+
bool operator==(const ADLocalAnomalyMetrics &r) const{
118+
return m_app == r.m_app && m_rank == r.m_rank && m_step == r.m_step && m_func_anom_metrics == r.m_func_anom_metrics &&
119+
m_first_event_ts == r.m_first_event_ts && m_last_event_ts == r.m_last_event_ts;
120+
}
121+
122+
/**
123+
* @brief Inequality operator
124+
*/
125+
inline bool operator!=(const ADLocalAnomalyMetrics &r) const{ return !(*this == r); }
126+
127+
/**
128+
* @brief Attach a RunMetric object into which performance metrics are accumulated
129+
*/
130+
void linkPerf(PerfStats* perf){ m_perf = perf; }
131+
132+
private:
133+
int m_app; /**< Program idx*/
134+
int m_rank; /**< rank */
135+
int m_step; /**< io step */
136+
unsigned long m_first_event_ts; /**< Timestamp of first event on step */
137+
unsigned long m_last_event_ts; /**< Timestamp of last event on step */
138+
139+
std::unordered_map<int, FuncAnomalyMetrics> m_func_anom_metrics; /**< Map of function idx to the metrics for that function*/
140+
141+
PerfStats *m_perf; /**< Store performance data */
142+
};
143+
144+
145+
146+
147+
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#pragma once
2+
#include <chimbuko_config.h>
3+
#include <chimbuko/ad/ADNetClient.hpp>
4+
#include <chimbuko/ad/ExecData.hpp>
5+
#include <chimbuko/util/RunStats.hpp>
6+
7+
namespace chimbuko{
8+
9+
/**
10+
* @brief Aggregated anomaly metrics for a single function
11+
*/
12+
class FuncAnomalyMetrics{
13+
public:
14+
15+
/**
16+
* @brief Class state that is serialized for pserver comms
17+
*/
18+
struct State{
19+
RunStats::State score;
20+
RunStats::State severity;
21+
int count;
22+
int fid;
23+
std::string func;
24+
25+
State(const FuncAnomalyMetrics &parent): score(parent.m_score.get_state()), severity(parent.m_severity.get_state()), count(parent.m_count), fid(parent.m_fid), func(parent.m_func){}
26+
State(): count(0){}
27+
28+
/*
29+
* @brief Serialize this instance in Cereal
30+
*/
31+
template<class Archive>
32+
void serialize(Archive & archive){
33+
archive(score,severity,count,fid,func);
34+
}
35+
};
36+
37+
FuncAnomalyMetrics(): m_count(0), m_score(true), m_severity(true), m_fid(-1){}
38+
FuncAnomalyMetrics(const int fid, const std::string &func): m_fid(fid), m_func(func), m_count(0), m_score(true), m_severity(true){}
39+
40+
/**
41+
* @brief Get the current state as a state object
42+
*
43+
* The string dump of this object is the serialized form sent to the parameter server
44+
*/
45+
State get_state() const;
46+
47+
48+
/**
49+
* @brief Set the internal variables to the given state object
50+
*/
51+
void set_state(const State &s);
52+
53+
54+
/**
55+
* @brief Add data associated with an anomalous event
56+
*/
57+
void add(const ExecData_t &event);
58+
59+
60+
/**
61+
* @brief Equivalence operator
62+
*/
63+
bool operator==(const FuncAnomalyMetrics &r) const{
64+
return m_score == r.m_score && m_severity == r.m_severity && m_count == r.m_count && m_fid == r.m_fid && m_func == r.m_func;
65+
}
66+
67+
/**
68+
* @brief Inequality operator
69+
*/
70+
inline bool operator!=(const FuncAnomalyMetrics &r) const{ return !(*this == r); }
71+
72+
const RunStats &get_score() const{ return m_score; }
73+
const RunStats &get_severity() const{ return m_severity; }
74+
int get_count() const{ return m_count; }
75+
int get_fid() const{ return m_fid; }
76+
const std::string & get_func() const{ return m_func; }
77+
78+
private:
79+
RunStats m_score; /**< Anomaly scores */
80+
RunStats m_severity; /**< Anomaly severity */
81+
int m_count; /**< The number of anomalies */
82+
int m_fid; /**< Function index */
83+
std::string m_func; /**< Function name */
84+
85+
};
86+
87+
};

include/chimbuko/message.hpp

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ enum MessageKind {
3232
ANOMALY_STATS = 3,
3333
COUNTER_STATS = 4,
3434
FUNCTION_INDEX = 5,
35-
AD_PS_COMBINED_STATS = 6
35+
ANOMALY_METRICS = 6,
36+
AD_PS_COMBINED_STATS = 7
3637
};
3738

3839
enum MessageCmd {
@@ -196,19 +197,7 @@ class Message {
196197
/**
197198
* @brief Get the message kind in string form
198199
*/
199-
std::string kind_str() const {
200-
switch(m_head.kind())
201-
{
202-
case 0: return "DEFAULT";
203-
case 1: return "CMD";
204-
case 2: return "PARAMETERS";
205-
case 3: return "ANOMALY_STATS";
206-
case 4: return "COUNTER_STATS";
207-
case 5: return "FUNCTION_INDEX";
208-
case 6: return "AD_PS_COMBINED_STATS";
209-
default: return "UNKNOWN";
210-
}
211-
}
200+
std::string kind_str() const;
212201

213202
/**
214203
* @brief Get the message size in bytes
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#pragma once
2+
#include <chimbuko_config.h>
3+
#include <chimbuko/ad/FuncAnomalyMetrics.hpp>
4+
5+
namespace chimbuko {
6+
7+
/**< An object holding aggregated anomaly metrics on a function for a specific rank*/
8+
struct AggregateFuncAnomalyMetrics{
9+
public:
10+
/**
11+
* @brief Constructor
12+
* @param pid Program idx
13+
* @param rid Rank idx
14+
* @param fid Function idx
15+
* @param func Function name
16+
*/
17+
AggregateFuncAnomalyMetrics(int pid, int rid, int fid, const std::string &func);
18+
19+
/**
20+
* @brief Add more data into the accumulated totals
21+
* @param metrics Metrics to include
22+
* @param io_step The IO step in which the data are associated
23+
* @param first_event_ts The timestamp of the first event in the IO step
24+
* @param last_event_ts The timestamp of the first event in the IO step
25+
*/
26+
void add(const FuncAnomalyMetrics &metrics, unsigned long io_step, unsigned long first_event_ts, unsigned long last_event_ts);
27+
28+
/**
29+
* @brief Create a JSON object from this instance
30+
*/
31+
nlohmann::json get_json() const;
32+
33+
/**
34+
* @brief Get the program idx
35+
*/
36+
const int get_pid() const{ return m_pid; }
37+
38+
/**
39+
* @brief Get the rank
40+
*/
41+
const int get_rid() const{ return m_rid; }
42+
43+
/**
44+
* @brief Get the function idx
45+
*/
46+
const int get_fid() const{ return m_fid; }
47+
/**
48+
* @brief Get the function name
49+
*/
50+
const std::string &get_func() const{ return m_func; }
51+
/**
52+
* @brief Get the statistics on the number of anomalies per io step
53+
*/
54+
const RunStats & get_count_stats() const{ return m_count; }
55+
/**
56+
* @brief Get the statistics on the anomaly scores
57+
*/
58+
const RunStats & get_score_stats() const{ return m_score; }
59+
/**
60+
* @brief Get the statistics on the anomaly scores
61+
*/
62+
const RunStats & get_severity_stats() const{ return m_severity; }
63+
64+
/**
65+
* @brief Get the timestamp of the first event included
66+
*/
67+
unsigned long get_first_event_ts() const{ return m_first_event_ts; }
68+
69+
/**
70+
* @brief Get the timestamp of the last event included
71+
*/
72+
unsigned long get_last_event_ts() const{ return m_last_event_ts; }
73+
74+
/**
75+
* @brief Get the first IO step included
76+
*/
77+
unsigned long get_first_io_step() const{ return m_first_io_step; }
78+
79+
/**
80+
* @brief Get the last IO step included
81+
*/
82+
unsigned long get_last_io_step() const{ return m_last_io_step; }
83+
84+
85+
/**
86+
* @brief Comparison operator
87+
*/
88+
bool operator==(const AggregateFuncAnomalyMetrics &r) const{ return m_pid == r.m_pid && m_rid == r.m_rid && m_fid == r.m_fid && m_func == r.m_func &&
89+
m_count == r.m_count && m_score == r.m_score && m_severity == r.m_severity &&
90+
m_first_event_ts == r.m_first_event_ts && m_last_event_ts == r.m_last_event_ts &&
91+
m_first_io_step == r.m_first_io_step && m_last_io_step == r.m_last_io_step;
92+
}
93+
/**
94+
* @brief Inequality operator
95+
*/
96+
bool operator!=(const AggregateFuncAnomalyMetrics &r) const{ return !( *this == r ); }
97+
98+
99+
private:
100+
int m_pid; /**< Program idx */
101+
int m_rid; /**< Rank idx */
102+
int m_fid; /**< Function idx */
103+
std::string m_func; /**< Func name */
104+
RunStats m_count; /**< Statistics on the number of anomalies on this function/rank per anomaly step*/
105+
RunStats m_score; /**< Statistics on anomaly scores accumulated over steps*/
106+
RunStats m_severity; /**< Statistics on anomaly severity accumulated over steps*/
107+
unsigned long m_first_event_ts; /**< Timestamp of first event in the aggregated data */
108+
unsigned long m_last_event_ts; /**< Timestamp of last event in the aggregated data */
109+
unsigned long m_first_io_step; /**< First IO step in the aggregated data */
110+
unsigned long m_last_io_step; /**< Last IO step in the aggregated data */
111+
};
112+
113+
114+
115+
}

0 commit comments

Comments
 (0)