Skip to content

Commit 3361b0d

Browse files
committed
Reworked handling of fatal errors:
Such errors are no longer written immediately to error stream but instead simply throw. Only uncaught exceptions are printed to the error stream through a new terminate handler. The terminate handler also closes the error stream file (if present) so that fatal errors will be written to disk. Chimbuko now gracefully handles instances where a counter event is detected without a corresponding counter name string in the map and no longer aborts
1 parent 1da6f55 commit 3361b0d

6 files changed

Lines changed: 105 additions & 34 deletions

File tree

app/driver.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
// #include "chimbuko/AD.hpp"
21
#include "chimbuko/chimbuko.hpp"
32
#include "chimbuko/verbose.hpp"
43
#include "chimbuko/util/string.hpp"

include/chimbuko/util/error.hpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,16 @@ namespace chimbuko{
2828
void recoverable(const std::string &msg, const std::string &func, const std::string &file, const unsigned long line);
2929

3030
/**
31-
* @brief Signal a fatal error
31+
* @brief Signal a fatal error. Not written to output stream unless either uncaught (which will also call abort) or manually flushed
3232
*/
3333
void fatal(const std::string &msg, const std::string &func, const std::string &file, const unsigned long line);
3434

35+
/**
36+
* @brief Manually flush an exception to the output stream
37+
*/
38+
void flushError(std::exception_ptr e);
39+
void flushError(const std::exception &e);
40+
3541
private:
3642
std::string getErrStr(const std::string &type, const std::string &msg, const std::string &func, const std::string &file, const unsigned long line) const;
3743

@@ -40,7 +46,17 @@ namespace chimbuko{
4046
std::mutex m_mutex;
4147
};
4248

43-
extern ErrorWriter g_error; /**< Global instance of ErrorWriter*/
49+
/**
50+
* @brief The global error writer instance
51+
*/
52+
ErrorWriter & Error();
53+
54+
/**
55+
* @brief For fatal errors we delay writing the error to the output stream in case it is caught. This terminate handler ensures it is written
56+
*
57+
* After flushing the error the handler calls the terminateHandlerAbortAction above
58+
*/
59+
void writeErrorTerminateHandler();
4460

4561
/**
4662
* @brief Set the error output of the global error writer to a stream and specify the rank
@@ -57,12 +73,12 @@ namespace chimbuko{
5773
* @brief Signal a recoverable error
5874
*/
5975
#define recoverable_error(MSG) \
60-
{ g_error.recoverable(MSG, __func__, __FILE__, __LINE__); }
76+
{ chimbuko::Error().recoverable(MSG, __func__, __FILE__, __LINE__); }
6177

6278
/**
6379
* @brief Signal a fatal error
6480
*/
6581
#define fatal_error(MSG) \
66-
{ g_error.fatal(MSG, __func__, __FILE__, __LINE__); }
82+
{ chimbuko::Error().fatal(MSG, __func__, __FILE__, __LINE__); }
6783

6884
}

src/chimbuko.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,11 @@ void Chimbuko::extractCounters(int rank, int step){
367367
EventDataType::COUNT,
368368
c,
369369
eventID(rank, step, c));
370-
m_counter->addCounter(ev);
370+
try{
371+
m_counter->addCounter(ev);
372+
}catch(const std::exception &e){
373+
recoverable_error(std::string("extractCounters failed to register counter event :\"") + ev.get_json().dump() + "\"");
374+
}
371375
}
372376
m_perf.add("ad_extract_counters_get_register_ms", timer.elapsed_ms());
373377
}

src/util/error.cpp

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
#include <iostream>
66
#include <sstream>
77
#include <fstream>
8-
8+
#include <exception>
9+
#include <memory>
910

1011
namespace chimbuko{
1112

@@ -17,8 +18,6 @@ namespace chimbuko{
1718
}
1819
void ErrorWriter::fatal(const std::string &msg, const std::string &func, const std::string &file, const unsigned long line){
1920
std::string err = getErrStr("FATAL", msg, func, file, line);
20-
std::lock_guard<std::mutex> _(m_mutex);
21-
*m_ostream << err;
2221
throw std::runtime_error(err);
2322
}
2423

@@ -31,21 +30,57 @@ namespace chimbuko{
3130
return ss.str();
3231
}
3332

34-
ErrorWriter g_error;
33+
void ErrorWriter::flushError(std::exception_ptr e){
34+
if(e){
35+
try{ //this is necessary to get the exception type
36+
rethrow_exception(e);
37+
}catch(const std::exception &ev){
38+
flushError(ev);
39+
}
40+
}
41+
}
42+
void ErrorWriter::flushError(const std::exception &e){
43+
std::lock_guard<std::mutex> _(m_mutex);
44+
(*m_ostream) << e.what();
45+
}
46+
47+
ErrorWriter & Error(){
48+
static bool first = true;
49+
static ErrorWriter e;
50+
51+
//Register the terminate handler
52+
if(first){
53+
std::set_terminate(writeErrorTerminateHandler);
54+
first = false;
55+
}
56+
return e;
57+
}
3558

3659
void set_error_output_stream(const int rank, std::ostream *strm){
37-
g_error.setRank(rank);
38-
g_error.setStream(strm);
60+
Error().setRank(rank);
61+
Error().setStream(strm);
3962
}
4063

64+
std::unique_ptr<std::ofstream> & error_output_file(){ static std::unique_ptr<std::ofstream> u; return u; }
65+
4166
void set_error_output_file(const int rank, const std::string &file_stub){
4267
std::string filename = stringize("%s.%d",file_stub.c_str(),rank);
43-
static std::ofstream f(filename);
44-
if(f.fail()) throw std::runtime_error("set_error_output_file could not open file " + filename);
45-
g_error.setRank(rank);
46-
g_error.setStream(&f);
68+
error_output_file().reset(new std::ofstream(filename));
69+
if(error_output_file()->fail()) throw std::runtime_error("set_error_output_file could not open file " + filename);
70+
Error().setRank(rank);
71+
Error().setStream(error_output_file().get());
4772
}
4873

49-
74+
void writeErrorTerminateHandler(){
75+
if( auto exc = std::current_exception() ) {
76+
Error().flushError(exc);
77+
}
78+
79+
//Make sure to close the error output file first because abort does not destruct static objects
80+
if(error_output_file()) error_output_file()->close();
81+
82+
std::abort();
83+
}
84+
5085

5186
}

test/unit_tests/ad/ADCounter.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,15 @@ TEST(ADCounterTest, CounterAddedCorrectly){
6161
EXPECT_EQ(idctrs.size(), 1);
6262
EXPECT_EQ(*idctrs.begin(), clist.begin());
6363

64+
//Try adding a counter event not in the counter list and ensure it throws an error
65+
Event_t ev2 = createCounterEvent_t(pid, rid, tid, 101, value, ts);
66+
bool caught_error = false;
67+
try{
68+
counters.addCounter(ev2);
69+
}catch(const std::exception &e){
70+
std::cout << "Caught expected exception: " << e.what() << std::endl;
71+
caught_error = true;
72+
}
73+
EXPECT_TRUE(caught_error);
74+
6475
}

test/unit_tests/util/error.cpp

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@ std::string removeDateTime(const std::string &from){
99
if(pos == std::string::npos) return from;
1010
return from.substr(pos+2);
1111
}
12-
12+
13+
void donothing(){}
1314

1415
TEST(TestError, runTests){
1516
//Test with default rank
1617
{
1718
std::stringstream ss;
18-
g_error.setStream(&ss);
19-
g_error.recoverable("hello", "test_func", "this_file", 129);
19+
Error().setStream(&ss);
20+
Error().recoverable("hello", "test_func", "this_file", 129);
2021

2122
std::string e = removeDateTime(ss.str());
2223
EXPECT_EQ(e, "Error (recoverable) : test_func (this_file:129) : hello\n");
@@ -26,7 +27,7 @@ TEST(TestError, runTests){
2627
{
2728
std::stringstream ss;
2829
set_error_output_stream(22, &ss);
29-
g_error.recoverable("hello", "test_func", "this_file", 129);
30+
Error().recoverable("hello", "test_func", "this_file", 129);
3031

3132
std::string e = removeDateTime(ss.str());
3233
EXPECT_EQ(e, "Error (recoverable) rank 22 : test_func (this_file:129) : hello\n");
@@ -53,31 +54,36 @@ TEST(TestError, runTests){
5354
std::stringstream ss;
5455
set_error_output_stream(22, &ss);
5556
bool caught = false;
56-
std::string thrown_err;
57-
unsigned long line = __LINE__ + 2;
57+
unsigned long line = __LINE__ + 3;
58+
std::runtime_error thrown_err("pooh");
5859
try{
5960
fatal_error("hello");
60-
}catch(std::exception &e){
61-
thrown_err = removeDateTime(e.what());
61+
}catch(const std::runtime_error &e){
62+
std::cout << "Caught " << e.what() << std::endl;
63+
thrown_err = e;
6264
caught = true;
6365
}
6466

6567
EXPECT_EQ(caught, true);
6668

67-
std::string e = removeDateTime(ss.str());
68-
6969
std::stringstream expect;
7070
expect << "Error (FATAL) rank 22 : " << __func__ << " (" << __FILE__ << ":" << line <<") : hello\n";
71-
72-
std::cout << e << std::endl;
73-
std::cout << thrown_err << std::endl;
74-
75-
EXPECT_EQ(e, expect.str());
76-
EXPECT_EQ(thrown_err, expect.str());
77-
}
7871

72+
std::string thrown_err_str = removeDateTime(thrown_err.what());
7973

74+
std::cout << thrown_err_str << std::endl;
75+
EXPECT_EQ(thrown_err_str, expect.str());
76+
77+
//Normally fatal errors are only written to the error stream if they are uncaught
78+
//As we caught the error above it will not be in the error stream unless we do an explicit flush
79+
EXPECT_EQ(ss.str().size(), 0);
8080

81+
Error().flushError(thrown_err);
8182

83+
std::string e = removeDateTime(ss.str());
84+
85+
std::cout << e << std::endl;
8286

87+
EXPECT_EQ(e, expect.str());
88+
}
8389
}

0 commit comments

Comments
 (0)