Skip to content

Commit e2fb4b0

Browse files
committed
Fixed a bug where empty data sets in ExecData_map would lead to ADLocalFuncStatistics sending uninitialized function stats data objects to the pserver and hence to the global database
1 parent d076400 commit e2fb4b0

8 files changed

Lines changed: 59 additions & 4 deletions

File tree

include/chimbuko/modules/performance_analysis/ad/ADExecDataInterface.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ namespace chimbuko {
5656
/**
5757
* @brief Return the function index associated with a given data set
5858
*/
59-
size_t getDataSetModelIndex(size_t dset_index) const override{ return m_dset_fid_map[dset_index]; }
59+
size_t getDataSetModelIndex(size_t dset_index) const override;
6060

6161
/**
6262
* @brief Return the data set index associated with a given function index

src/core/chimbuko.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ bool ChimbukoBase::runFrame(){
335335
#endif
336336

337337
int nout = iface->nEventsRecorded(ADDataInterface::EventType::Outlier);
338-
int nnormal = iface->nEvents() - nout; //this is the total number of normal events, not just of those that were recorded
338+
int nnormal = iface->nEvents() - nout; //this is the total number of normal events (also unlabeled events), not just of those that were recorded
339339
m_run_stats.n_outliers += nout;
340340
m_accum_prd.n_outliers += nout;
341341

src/modules/performance_analysis/ad/ADAnomalyProvenance.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,8 @@ void ADAnomalyProvenance::getProvenanceEntries(std::vector<nlohmann::json> &anom
236236
for(auto const &e : iface.getResults(dset_idx).getEventsRecorded(ADDataInterface::EventType::Outlier)){
237237
timer2.start();
238238
auto anom_it = iface.getExecDataEntry(dset_idx, e.index);
239+
240+
//verboseStream << "Collecting provenance for anomaly on dset " << dset_idx << " index " << e.index << " with content:\n" << anom_it->get_json().dump(2) << std::endl;
239241

240242
if(anom_it->get_exclusive() < m_min_anom_time) continue; //skip executions with too short runtimes to avoid filling the database with irrelevant anomalies
241243

src/modules/performance_analysis/ad/ADExecDataInterface.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ ADExecDataInterface::ADExecDataInterface(ExecDataMap_t const* execDataMap, Outli
99
size_t dset_idx = 0;
1010
for(auto it = execDataMap->begin(); it != execDataMap->end(); ++it)
1111
m_dset_fid_map[dset_idx++] = it->first;
12+
13+
if(enableVerboseLogging()){
14+
std::cout << "ADExecDataInterface created with #datasets=" << this->nDataSets() << " (exec-data map size " << execDataMap->size() << ")" << " with dset_idx:fid mapping ";
15+
for(int d=0;d<this->nDataSets();d++) std::cout << d << ":" << m_dset_fid_map[d] << " ";
16+
std::cout << std::endl;
17+
}
1218
}
1319

1420
double ADExecDataInterface::getStatisticValue(const ExecData_t &e) const{
@@ -30,6 +36,14 @@ void ADExecDataInterface::setIgnoreFunction(const std::string &func){
3036
m_func_ignore.insert(func);
3137
}
3238

39+
size_t ADExecDataInterface::getDataSetModelIndex(size_t dset_index) const{
40+
if(dset_index >= m_dset_fid_map.size()){
41+
std::string err = "Invalid dset_index " +std::to_string(dset_index);
42+
fatal_error(err);
43+
}
44+
return m_dset_fid_map[dset_index];
45+
}
46+
3347
size_t ADExecDataInterface::getDataSetIndexOfFunction(size_t fid) const{
3448
for(size_t dset_idx = 0; dset_idx < m_dset_fid_map.size(); dset_idx++)
3549
if(m_dset_fid_map[dset_idx] == fid) return dset_idx;
@@ -39,6 +53,7 @@ size_t ADExecDataInterface::getDataSetIndexOfFunction(size_t fid) const{
3953
CallListIterator_t ADExecDataInterface::getExecDataEntry(size_t dset_index, size_t elem_index) const{
4054
auto it = m_execDataMap->find(m_dset_fid_map[dset_index]);
4155
if(it == m_execDataMap->end()){ fatal_error("Invalid dset_idx"); }
56+
if(elem_index >= it->second.size()){ fatal_error("Invalid elem_index"); }
4257
return it->second[elem_index];
4358
}
4459

@@ -78,6 +93,7 @@ void ADExecDataInterface::recordDataSetLabelsInternal(const std::vector<Elem> &d
7893
auto it = m_execDataMap->find(m_dset_fid_map[dset_index]);
7994
if(it == m_execDataMap->end()){ fatal_error("Invalid dset_idx"); }
8095
for(auto const &e: data){
96+
if(e.index >= it->second.size()){ fatal_error("Invalid element index"); }
8197
CallListIterator_t eint = it->second[e.index];
8298
eint->set_outlier_score(e.score);
8399
if(e.label != EventType::Unassigned){

src/modules/performance_analysis/ad/ADLocalFuncStatistics.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <chimbuko/modules/performance_analysis/ad/AnomalyData.hpp>
33
#include <chimbuko/modules/performance_analysis/pserver/PScommon.hpp>
44
#include <chimbuko/core/util/serialize.hpp>
5+
#include <chimbuko/core/verbose.hpp>
56
#include <climits>
67
#include <algorithm>
78

@@ -23,7 +24,7 @@ void ADLocalFuncStatistics::gatherStatistics(const ExecDataMap_t* exec_data){
2324
//Create new entry if it doesn't exist
2425
if(fstats_it == m_funcstats.end()){
2526
const std::string &name = it.second.front()->get_funcname(); //it.second has already been checked to have size >= 1
26-
fstats_it = m_funcstats.insert( std::unordered_map<unsigned long, FuncStats>::value_type(func_id, FuncStats(m_anom_data.get_app(), func_id, name)) ).first;
27+
fstats_it = m_funcstats.insert( std::unordered_map<unsigned long, FuncStats>::value_type(func_id, FuncStats(m_anom_data.get_app(), func_id, name)) ).first;
2728
}
2829

2930
for (auto itt : it.second) { //loop over events for that function
@@ -33,6 +34,8 @@ void ADLocalFuncStatistics::gatherStatistics(const ExecDataMap_t* exec_data){
3334
min_ts = std::min(min_ts, static_cast<unsigned long>(itt->get_entry()) );
3435
max_ts = std::max(max_ts, static_cast<unsigned long>(itt->get_exit()));
3536
}
37+
38+
verboseStream << "ADLocalFuncStatistics::gatherStatistics generated stats for func_id " << func_id << ":\n" << fstats_it->second.get_json().dump(2) << std::endl;
3639
}
3740

3841
m_anom_data.set_min_ts(min_ts);
@@ -42,13 +45,29 @@ void ADLocalFuncStatistics::gatherStatistics(const ExecDataMap_t* exec_data){
4245
void ADLocalFuncStatistics::gatherAnomalies(const ADExecDataInterface &iface){
4346
//Gather information on the number of anomalies and stats on their scores
4447
size_t nanom_tot = 0;
48+
verboseStream << "ADLocalFuncStatistics::gatherAnomalies processing anomalies for " << iface.nDataSets() << std::endl;
49+
4550
for(size_t dset_idx =0 ; dset_idx < iface.nDataSets(); dset_idx++){
4651
size_t fid = iface.getDataSetModelIndex(dset_idx);
4752
auto const & r = iface.getResults(dset_idx);
53+
54+
//Because some data sets may be empty or contain events that don't get labeled in this step, we should skip those data sets
55+
//This is particularly important for empty data sets as these do not have entries in m_funcstats, hence trying to push anomaly counts (even 0) will lead to uninitialized entries
56+
if(r.nEventsRecorded(ADDataInterface::EventType::Outlier) == 0 && r.nEventsRecorded(ADDataInterface::EventType::Normal) == 0)
57+
continue;
58+
4859
auto const &anom = r.getEventsRecorded(ADDataInterface::EventType::Outlier);
4960
size_t nanom = anom.size();
5061
nanom_tot += nanom;
51-
m_funcstats[fid].n_anomaly += nanom; //increment func anomalies count
62+
63+
verboseStream << "ADLocalFuncStatistics::gatherAnomalies for dset_idx=" << dset_idx << " and fid=" << fid << " with " << nanom << " anomalies" << std::endl;
64+
65+
auto fit = m_funcstats.find(fid);
66+
if(fit == m_funcstats.end()){
67+
std::stringstream ss; ss << "ADLocalFuncStatistics::gatherAnomalies cannot find funcstats entry for function index " << fid << " with " << nanom << " anomalies";
68+
fatal_error(ss.str());
69+
}
70+
fit->second.n_anomaly += nanom; //increment func anomalies count
5271
for(auto const &e : anom) m_anom_data.add_outlier_score(e.score);
5372
}
5473
m_anom_data.incr_n_anomalies(nanom_tot);
@@ -64,6 +83,7 @@ nlohmann::json ADLocalFuncStatistics::get_json() const{
6483
g_info["func"].push_back(e.second.get_json());
6584
}
6685
g_info["anomaly"] = m_anom_data.get_json();
86+
6787
return g_info;
6888
}
6989

src/modules/performance_analysis/ad/ADcombinedPSdata.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include<chimbuko/modules/performance_analysis/ad/ADcombinedPSdata.hpp>
22
#include <chimbuko/modules/performance_analysis/pserver/PScommon.hpp>
33
#include<chimbuko/core/util/serialize.hpp>
4+
#include<chimbuko/core/verbose.hpp>
45

56
using namespace chimbuko;
67
using namespace chimbuko::modules::performance_analysis;
@@ -87,6 +88,13 @@ std::pair<size_t, size_t> ADcombinedPSdataArray::send(ADNetClient &net_client) c
8788
latest_step = std::max(step, latest_step);
8889
}
8990

91+
if(enableVerboseLogging()){
92+
for(ADLocalFuncStatistics const &f : m_func_stats){
93+
std::cout << "ADcombinedPSdataArray::send sending func stats\n" << f.get_json().dump(2) << std::endl;
94+
}
95+
96+
}
97+
9098
Message msg;
9199
msg.set_info(net_client.get_client_rank(), net_client.get_server_rank(), MessageType::REQ_ADD, MessageKind::AD_PS_COMBINED_STATS, latest_step);
92100
msg.setContent(net_serialize());

src/modules/performance_analysis/chimbuko.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,10 @@ void Chimbuko::bufferStorePSdata(const ADDataInterface &anomalies, const int st
358358
ADLocalFuncStatistics func_stats(m_program_idx, rank, step, &perf);
359359
func_stats.gatherStatistics(m_event->getExecDataMap());
360360
func_stats.gatherAnomalies(ei);
361+
362+
verboseStream << "Chimbuko::bufferStorePSdata func_stats has " << func_stats.getFuncStats().size() << " entries" << std::endl;
363+
verboseStream << "Chimbuko::bufferStorePSdata generated func_stats " << func_stats.get_json().dump(2) << std::endl;
364+
361365
m_funcstats_buf.emplace_back(std::move(func_stats));
362366
perf.add("ad_gather_ps_data_gather_profile_stats_time_ms", timer.elapsed_ms());
363367

@@ -374,6 +378,8 @@ void Chimbuko::bufferStorePSdata(const ADDataInterface &anomalies, const int st
374378
m_anom_metrics_buf.emplace_back(std::move(metrics));
375379
perf.add("ad_gather_ps_data_gather_metrics_time_ms", timer.elapsed_ms());
376380
perf.add("ad_gather_ps_data_total_time_ms", timer.elapsed_ms());
381+
382+
verboseStream << "Chimbuko::bufferStorePSdata buffered func_stats " << m_funcstats_buf.back().get_json().dump(2) << std::endl;
377383
}
378384
}
379385

src/modules/performance_analysis/pserver/GlobalAnomalyStats.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ void GlobalAnomalyStats::add_anomaly_data(const ADLocalFuncStatistics& data){
7272

7373
for (auto const &fp: data.getFuncStats()) {
7474
const FuncStats &f = fp.second;
75+
76+
verboseStream << "GlobalAnomalyStats adding FuncStats " << f.get_json().dump(2) << std::endl;
77+
7578
update_func_stat(f.pid, f.id, f.name, f.n_anomaly, f.inclusive, f.exclusive);
7679
}
7780
}

0 commit comments

Comments
 (0)