Skip to content

Commit 2aab945

Browse files
committed
Replaced the library-implemented timeout of ADIOS2::BeginStep with a manual timeout loop, in the hope of avoiding SST hangs on Summit; added a unit test
1 parent c6f7bc0 commit 2aab945

2 files changed

Lines changed: 71 additions & 9 deletions

File tree

src/ad/ADParser.cpp

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,38 @@ ADParser::~ADParser() {
7272

7373
// NOTE: this is a rendered diff of ADParser::beginStep, not compilable source.
// A gutter line ending in '-' marks the next line as removed, one ending in '+'
// marks it as added; gutters holding two numbers are unchanged context lines.
//
// Purpose: when m_opened is set, try to begin the next step on m_reader and
// return m_current_step.
//   - StepStatus::OK          : m_current_step is incremented.
//   - StepStatus::NotReady    : reported through recoverable_error as a timeout.
//   - StepStatus::EndOfStream : logged through headProgressStream(m_rank).
//   - any other status        : reported through recoverable_error.
// Every non-OK final outcome sets m_status = false and m_current_step = -1.
// The `verbose` parameter is not read in the lines shown.
//
// Removed version: a single BeginStep call that receives float(m_beginstep_timeout).
// Added version: BeginStep is polled with a 0.0f timeout inside a loop, and the
// elapsed wall-clock time is compared against m_beginstep_timeout by hand.
int ADParser::beginStep(bool verbose) {
7474
if (m_opened){
75-
adios2::StepStatus status = m_reader.BeginStep(adios2::StepMode::Read, float(m_beginstep_timeout));
76-
if (status == adios2::StepStatus::OK){
77-
m_current_step++;
78-
}else{
79-
if(status == adios2::StepStatus::NotReady){ recoverable_error("ADParser::beginStep : ADIOS2::BeginStep timed out waiting for next step to be ready\n"); }
80-
else if(status == adios2::StepStatus::EndOfStream){ headProgressStream(m_rank) << "ADParser::beginStep rank 0 detected end of data stream" << std::endl; }
81-
else{ recoverable_error("ADParser::beginStep : ADIOS2::BeginStep returned an unknown error\n"); }
82-
m_status = false;
83-
m_current_step = -1;
75+
typedef std::chrono::high_resolution_clock Clock;
76+
// Reference point for the manual timeout; taken once, before the first poll.
auto start = Clock::now();
77+
78+
// Poll until a step is obtained, the stream ends, an error occurs, or the timeout elapses.
while(1){
79+
adios2::StepStatus status = m_reader.BeginStep(adios2::StepMode::Read, 0.0f);
80+
if (status == adios2::StepStatus::OK){
81+
m_current_step++;
82+
break;
83+
}else if(status == adios2::StepStatus::NotReady){
84+
// Milliseconds elapsed since polling started.
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - start).count();
85+
// m_beginstep_timeout is scaled by 1000 before being compared with milliseconds,
// i.e. it is treated as a number of seconds here.
// NOTE(review): a negative m_beginstep_timeout satisfies this test on the first
// NotReady; if callers relied on a negative value meaning "wait indefinitely" in
// the removed BeginStep call, that behaviour is no longer preserved - confirm.
if(elapsed > m_beginstep_timeout * 1000){
86+
recoverable_error("ADParser::beginStep : ADIOS2::BeginStep timed out waiting for next step to be ready\n");
87+
m_status = false;
88+
m_current_step = -1;
89+
break;
90+
}else{
91+
// Fixed one-second pause between polls, so the timeout is only re-checked
// roughly once per second.
std::this_thread::sleep_for (std::chrono::seconds(1));
92+
}
93+
}else if(status == adios2::StepStatus::EndOfStream){
94+
headProgressStream(m_rank) << "ADParser::beginStep rank 0 detected end of data stream" << std::endl;
95+
m_status = false;
96+
m_current_step = -1;
97+
break;
98+
}else{
99+
recoverable_error("ADParser::beginStep : ADIOS2::BeginStep returned an unknown error\n");
100+
m_status = false;
101+
m_current_step = -1;
102+
break;
103+
}
84104
}
85105
}
106+
86107
// m_current_step is returned unchanged when m_opened is false.
return m_current_step;
87108
}
88109

test/unit_tests/ad/ADParser.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,47 @@ TEST(ADParserTestbeginStep, stepStartEndOK){
178178
}
179179

180180

181+
// NOTE: rendered diff; each 'NNN+' gutter line marks the following line as added.
//
// Checks that ADParser::beginStep gives up after the configured timeout when
// the writer side never produces a step: the writer thread only opens and
// closes (its BeginStep/EndStep calls are commented out), while the reader
// thread sets a timeout of 5, times the beginStep call, and records the
// returned step index and the parser status.
TEST(ADParserTestbeginStep, stepStartTimeoutWorks){
182+
int step_idx = 0;
183+
double time = 0;
184+
bool status = true;
185+
bool exc = false;
186+
// NOTE(review): this try/catch only covers code running on the test's own
// thread (construction of rw, thread creation, join). An exception escaping
// either thread lambda below is not caught here; per the C++ standard it
// results in std::terminate.
try{
187+
SSTrw rw;
188+
189+
// Writer side: opened and closed without writing any step.
std::thread wthr([&](){
190+
rw.openWriter();
191+
//rw.wr.BeginStep();
192+
//rw.wr.EndStep();
193+
rw.closeWriter();
194+
});
195+
196+
// Reader side: results are written to the captured locals and read after join().
std::thread rthr([&](){
197+
rw.openReader();
198+
// Presumably seconds: the assertion below expects more than 4999 ms to elapse.
rw.parser->setBeginStepTimeout(5);
199+
typedef std::chrono::high_resolution_clock Clock;
200+
auto start = Clock::now();
201+
step_idx = rw.parser->beginStep(true);
202+
// Wall-clock duration of the beginStep call, in milliseconds.
time = std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - start).count();
203+
status = rw.parser->getStatus();
204+
rw.closeReader();
205+
});
206+
207+
rthr.join();
208+
wthr.join();
209+
}catch(std::exception &e){
210+
std::cout << "Caught exception:\n" << e.what() << std::endl;
211+
exc = true;
212+
}
213+
EXPECT_EQ(exc, false);
214+
// beginStep is expected to report failure: step index -1 and status false,
// and only after at least the timeout has passed.
EXPECT_EQ(step_idx, -1);
215+
EXPECT_GT(time, 4999);
216+
EXPECT_EQ(status, false);
217+
}
218+
219+
220+
221+
181222

182223
TEST(ADParserTestAttributeIO, funcAttributeCommunicated){
183224
bool err = false;

0 commit comments

Comments
 (0)