@@ -42,68 +42,44 @@ const AggregateAnomalyData & GlobalAnomalyStats::get_anomaly_stat_container(cons
4242}
4343
4444RunStats GlobalAnomalyStats::get_anomaly_stat_obj (const int pid, const unsigned long rid) const {
45+ std::lock_guard<std::mutex> _ (m_mutex_anom);
4546 return get_anomaly_stat_container (pid, rid).get_stats ();
4647}
4748
4849std::string GlobalAnomalyStats::get_anomaly_stat (const int pid, const unsigned long rid) const {
49- RunStats stat;
50- try {
51- stat = get_anomaly_stat_obj (pid,rid);
52- }catch (const std::exception &e){
53- return " " ;
54- }
55- return stat.get_json ().dump ();
50+ std::lock_guard<std::mutex> _ (m_mutex_anom);
51+ auto pit = m_anomaly_stats.find (pid);
52+ if (pit == m_anomaly_stats.end ()) return " " ;
53+ auto rit = pit->second .find (rid);
54+ if (rit == pit->second .end ()) return " " ;
55+ return rit->second .get_stats ().get_json ().dump ();
5656}
5757
5858size_t GlobalAnomalyStats::get_n_anomaly_data (const int pid, const unsigned long rid) const {
59- AggregateAnomalyData const * s;
60- try {
61- s = &get_anomaly_stat_container (pid,rid);
62- }catch (const std::exception &e){
63- return 0 ;
64- }
65- return s->get_n_data ();
59+ std::lock_guard<std::mutex> _ (m_mutex_anom);
60+ auto pit = m_anomaly_stats.find (pid);
61+ if (pit == m_anomaly_stats.end ()) return 0 ;
62+ auto rit = pit->second .find (rid);
63+ if (rit == pit->second .end ()) return 0 ;
64+ return rit->second .get_n_data ();
6665}
6766
6867nlohmann::json GlobalAnomalyStats::collect_stat_data (){
6968 nlohmann::json jsonObjects = nlohmann::json::array ();
70-
71- // m_anomaly_stats is a map of app_idx/rank to AggregateAnomalyData instances
72- // AggregateAnomalyData contains statistics on the number of anomalies found per io step and also a set of AnomalyData objects
73- // that have been collected from that rank since the last flush
74- for (auto & pp : m_anomaly_stats){
75- int pid = pp.first ; // pid
76- for (auto & rp: pp.second ){
77- unsigned long rid = rp.first ; // rank
78-
79- auto stats = rp.second .get (); // returns a std::pair<RunStats, std::list<AnomalyData>*>, and flushes the state of pair.second.
80- // We now own the std::list<AnomalyData>* pointer and have to delete it
81-
82- if (stats.second ){
83- // Decide whether to include the data for this pid/rid
84- // Do this only if any anomalies were seen since the last call
85- bool include = false ;
86- for (const AnomalyData &adata: *stats.second ){
87- if (adata.get_n_anomalies () > 0 ){
88- include = true ;
89- break ;
90- }
91- }
92-
93- if (include){
94- nlohmann::json object;
95- object[" key" ] = stringize (" %d:%d" , pid,rid);
96- object[" stats" ] = stats.first .get_json (); // statistics on anomalies to date for this pid/rid
97-
98- object[" data" ] = nlohmann::json::array ();
99- for (const AnomalyData &adata: *stats.second ){
100- // Don't include data for which there are no anomalies
101- if (adata.get_n_anomalies ()>0 )
102- object[" data" ].push_back (adata.get_json ());
103- }
104- jsonObjects.push_back (object);
105- }
106- delete stats.second ;
69+ {
70+ std::lock_guard<std::mutex> _ (m_mutex_anom);
71+
72+ // m_anomaly_stats is a map of app_idx/rank to AggregateAnomalyData instances
73+ // AggregateAnomalyData contains statistics on the number of anomalies found per io step and also a set of AnomalyData objects
74+ // that have been collected from that rank since the last flush
75+ for (auto & pp : m_anomaly_stats){
76+ int pid = pp.first ; // pid
77+ for (auto & rp: pp.second ){
78+ unsigned long rid = rp.first ; // rank
79+
80+ nlohmann::json object = rp.second .get_json_and_flush (pid,rid);
81+ if (!object.empty ())
82+ jsonObjects.push_back (std::move (object));
10783 }
10884 }
10985 }
@@ -153,7 +129,6 @@ void GlobalAnomalyStats::update_func_stat(int pid, unsigned long fid, const std:
153129}
154130
155131const AggregateFuncStats & GlobalAnomalyStats::get_func_stats (int pid, unsigned long fid) const {
156- std::lock_guard<std::mutex> _ (m_mutex_func);
157132 auto pit = m_funcstats.find (pid);
158133 if (pit == m_funcstats.end ()) fatal_error (" Could not find program index" );
159134 auto fit = pit->second .find (fid);
0 commit comments