Skip to content

Commit 9f629f3

Browse files
committed
In ADEvent, detection of multiple correlation IDs for GPU events is no longer a fatal error but instead a recoverable error. Modified unit test accordingly.
In ADAnomalyProvenance, detection of multiple correlation IDs for GPU events is no longer a fatal error; instead, the provenance data records an error message. Missing correlation IDs, and cases where the parent event has somehow been deleted, now also record error messages in the provenance data. Added unit tests of the above.
1 parent e765b82 commit 9f629f3

4 files changed

Lines changed: 226 additions & 53 deletions

File tree

src/ad/ADAnomalyProvenance.cpp

Lines changed: 48 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -49,46 +49,56 @@ void ADAnomalyProvenance::getGPUeventInfo(const ExecData_t &call, const ADEvent
4949
//Note a GPU event can only be partnered to one CPU event but a CPU event can be partnered to multiple GPU events
5050
if(call.n_GPU_correlationID_partner() != 1){
5151
std::stringstream ss; ss << "GPU event has multiple correlation ID partners?? Event details:" << std::endl << call.get_json(false,true).dump() << std::endl;
52-
fatal_error(ss.str());
53-
}
54-
55-
verboseStream << "Call has a GPU correlation ID partner: " << call.get_GPU_correlationID_partner(0).toString() << std::endl;
56-
57-
const eventID &gpu_event_parent = call.get_GPU_correlationID_partner(0);
58-
m_gpu_event_parent_info["event_id"] = gpu_event_parent.toString();
59-
60-
//Get the parent event
61-
CallListIterator_t pit;
62-
bool got_parent = true;
63-
try{
64-
pit = event_man.getCallData(gpu_event_parent);
65-
}catch(const std::exception &e){
66-
recoverable_error("Could not find GPU parent " + gpu_event_parent.toString() + " in call list due to : " + e.what());
67-
got_parent = false;
68-
}
69-
70-
if(got_parent){
71-
m_gpu_event_parent_info["tid"] = pit->get_tid();
72-
73-
//Generate the parent stack
74-
nlohmann::json gpu_event_parent_stack = nlohmann::json::array();
75-
gpu_event_parent_stack.push_back(getCallStackEntry(*pit));
76-
77-
eventID parent = pit->get_parent();
78-
while(parent != eventID::root()){
79-
CallListIterator_t call_it;
80-
try{
81-
call_it = event_man.getCallData(parent);
82-
}catch(const std::exception &e){
83-
recoverable_error("Could not find GPU stack event parent " + parent.toString() + " in call list due to : " + e.what());
84-
break;
52+
recoverable_error(ss.str());
53+
54+
m_gpu_event_parent_info = "Chimbuko error: Multiple host parent event correlation IDs found, likely due to trace corruption";
55+
}else{
56+
verboseStream << "Call has a GPU correlation ID partner: " << call.get_GPU_correlationID_partner(0).toString() << std::endl;
57+
58+
const eventID &gpu_event_parent = call.get_GPU_correlationID_partner(0);
59+
m_gpu_event_parent_info["event_id"] = gpu_event_parent.toString();
60+
61+
//Get the parent event
62+
CallListIterator_t pit;
63+
bool got_parent = true;
64+
try{
65+
pit = event_man.getCallData(gpu_event_parent);
66+
}catch(const std::exception &e){
67+
recoverable_error("Could not find GPU parent " + gpu_event_parent.toString() + " in call list due to : " + e.what());
68+
got_parent = false;
69+
}
70+
71+
if(got_parent){
72+
m_gpu_event_parent_info["tid"] = pit->get_tid();
73+
74+
//Generate the parent stack
75+
nlohmann::json gpu_event_parent_stack = nlohmann::json::array();
76+
gpu_event_parent_stack.push_back(getCallStackEntry(*pit));
77+
78+
eventID parent = pit->get_parent();
79+
while(parent != eventID::root()){
80+
CallListIterator_t call_it;
81+
try{
82+
call_it = event_man.getCallData(parent);
83+
}catch(const std::exception &e){
84+
recoverable_error("Could not find GPU stack event parent " + parent.toString() + " in call list due to : " + e.what());
85+
break;
86+
}
87+
gpu_event_parent_stack.push_back(getCallStackEntry(*call_it));
88+
parent = call_it->get_parent();
8589
}
86-
gpu_event_parent_stack.push_back(getCallStackEntry(*call_it));
87-
parent = call_it->get_parent();
90+
m_gpu_event_parent_info["call_stack"] = std::move(gpu_event_parent_stack);
91+
}else{
92+
//Could not get parent event
93+
m_gpu_event_parent_info = "Chimbuko error: Host parent event could not be reached";
8894
}
89-
m_gpu_event_parent_info["call_stack"] = std::move(gpu_event_parent_stack);
90-
}
91-
}//have correlation ID partner
95+
96+
}//have *one* correlation ID partner
97+
}else{
98+
//No correlation ID recorded
99+
m_gpu_event_parent_info = "Chimbuko error: Correlation ID of host parent event was not recorded";
100+
}
101+
92102
}//m_is_gpu_event
93103
}
94104

src/ad/ADEvent.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ void ADEvent::checkAndMatchCorrelationID(CallListIterator_t it){
119119
<< it->get_json(false, true).dump() << std::endl
120120
<< "GPU details: " << std::endl
121121
<< m_gpu_thread_Map->find(it->get_tid())->second.get_json().dump() << std::endl;
122-
fatal_error(ss.str());
122+
recoverable_error(ss.str());
123123
}
124124
}
125125

test/unit_tests/ad/ADAnomalyProvenance.cpp

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,162 @@ TEST(TestADAnomalyProvenance, detectsGPUevents){
249249

250250

251251

252+
TEST(TestADAnomalyProvenance, gracefullyFailsIfCorrelationIDissues){
253+
int gpu_thr = 9;
254+
int corrid_cid = 22; //counter index!
255+
256+
ADEvent event_man;
257+
258+
//Populate all the other stuff required to generate anomaly data
259+
RunStats stats;
260+
for(int i=0;i<50;i++)
261+
stats.push(double(i));
262+
263+
SstdParam param;
264+
param[44] = stats;
265+
param[55] = stats;
266+
param[11] = stats;
267+
268+
ADCounter counter;
269+
270+
ADMetadataParser metadata;
271+
std::vector<MetaData_t> mdata = {
272+
MetaData_t(0,0, gpu_thr, "CUDA Context", "8"),
273+
MetaData_t(0,0, gpu_thr, "CUDA Stream", "1"),
274+
MetaData_t(0,0, gpu_thr, "CUDA Device", "7"),
275+
MetaData_t(0,0, gpu_thr, "GPU[7] Device Name", "Fake GPU")
276+
};
277+
metadata.addData(mdata);
278+
279+
{
280+
std::cout << "Testing failure due to missing correlation ID" << std::endl;
281+
282+
//Have a host correlation ID but not a device one
283+
int corridx1 = 1234;
284+
ExecData_t exec_gpu = createFuncExecData_t(0,1, gpu_thr, 55, "thegpufunction", 1000, 100); //on gpu
285+
286+
ExecData_t exec_cpu = createFuncExecData_t(0,1, 0, 44, "thecpufunction", 1000, 100); //not on gpu
287+
exec_cpu.add_counter(createCounterData_t(0,1, 0, corrid_cid, corridx1, 1000, "Correlation ID"));
288+
289+
CallListIterator_t exec_cpu_it = event_man.addCall(exec_cpu);
290+
CallListIterator_t exec_gpu_it = event_man.addCall(exec_gpu);
291+
292+
ADAnomalyProvenance prov_gpu(*exec_gpu_it,
293+
event_man,
294+
param,
295+
counter, metadata, 0,
296+
11,900,1200);
297+
{
298+
nlohmann::json output = prov_gpu.get_json();
299+
std::cout << "For GPU event, got: " << output.dump() << std::endl;
300+
301+
EXPECT_EQ(output["is_gpu_event"], true);
302+
EXPECT_EQ(output["gpu_location"]["context"], 8);
303+
EXPECT_EQ(output["gpu_location"]["device"], 7);
304+
EXPECT_EQ(output["gpu_location"]["stream"], 1);
305+
306+
std::string got = output["gpu_parent"];
307+
std::string expect = "Chimbuko error: Correlation ID of host parent event was not recorded";
308+
std::cout << got << std::endl;
309+
310+
EXPECT_EQ(got, expect);
311+
}
312+
}
313+
314+
//Failure due to multiple correlation IDs
315+
{
316+
std::cout << "Testing failure due to multiple correlation IDs" << std::endl;
317+
int corridx2 = 2222, corridx3 = 3333;
318+
319+
ExecData_t exec_gpu = createFuncExecData_t(0,1, gpu_thr, 55, "thegpufunction", 1000, 100); //on gpu
320+
exec_gpu.add_counter(createCounterData_t(0,1, gpu_thr, corrid_cid, corridx2, 1000, "Correlation ID")); //this one has 2 correlation IDs
321+
exec_gpu.add_counter(createCounterData_t(0,1, gpu_thr, corrid_cid, corridx3, 1000, "Correlation ID"));
322+
323+
ExecData_t exec_cpu = createFuncExecData_t(0,1, 0, 44, "thecpufunction", 1000, 100); //not on gpu
324+
exec_cpu.add_counter(createCounterData_t(0,1, 0, corrid_cid, corridx2, 1000, "Correlation ID"));
325+
326+
ExecData_t exec_cpu2 = createFuncExecData_t(0,1, 0, 66, "theothercpufunction", 1000, 100); //not on gpu
327+
exec_cpu2.add_counter(createCounterData_t(0,1, 0, corrid_cid, corridx3, 1000, "Correlation ID"));
328+
329+
CallListIterator_t exec_cpu_it = event_man.addCall(exec_cpu);
330+
CallListIterator_t exec_cpu2_it = event_man.addCall(exec_cpu2);
331+
332+
CallListIterator_t exec_gpu_it = event_man.addCall(exec_gpu);
333+
334+
ADAnomalyProvenance prov_gpu(*exec_gpu_it,
335+
event_man,
336+
param,
337+
counter, metadata, 0,
338+
11,900,1200);
339+
{
340+
nlohmann::json output = prov_gpu.get_json();
341+
std::cout << "For GPU event, got: " << output.dump() << std::endl;
342+
343+
EXPECT_EQ(output["is_gpu_event"], true);
344+
EXPECT_EQ(output["gpu_location"]["context"], 8);
345+
EXPECT_EQ(output["gpu_location"]["device"], 7);
346+
EXPECT_EQ(output["gpu_location"]["stream"], 1);
347+
348+
std::string got = output["gpu_parent"];
349+
std::string expect = "Chimbuko error: Multiple host parent event correlation IDs found, likely due to trace corruption";
350+
std::cout << got << std::endl;
351+
352+
EXPECT_EQ(got, expect);
353+
}
354+
}
355+
356+
{
357+
std::cout << "Testing failure due to missing parent event" << std::endl;
358+
359+
//Host and device events share a correlation ID, but the host event is trimmed out before the device event is added
360+
int corridx4 = 4444;
361+
ExecData_t exec_gpu = createFuncExecData_t(0,1, gpu_thr, 55, "thegpufunction", 1000, 100); //on gpu
362+
exec_gpu.add_counter(createCounterData_t(0,1, gpu_thr, corrid_cid, corridx4, 1000, "Correlation ID")); //this one has a single correlation ID
363+
364+
ExecData_t exec_cpu = createFuncExecData_t(0,1, 0, 44, "thecpufunction", 1000, 100); //not on gpu
365+
exec_cpu.add_counter(createCounterData_t(0,1, 0, corrid_cid, corridx4, 1000, "Correlation ID"));
366+
367+
CallListIterator_t exec_cpu_it = event_man.addCall(exec_cpu);
368+
369+
//Force the trimming out of the cpu event
370+
exec_cpu_it->can_delete(true);
371+
delete event_man.trimCallList();
372+
373+
CallListIterator_t exec_gpu_it = event_man.addCall(exec_gpu);
374+
375+
ADAnomalyProvenance prov_gpu(*exec_gpu_it,
376+
event_man,
377+
param,
378+
counter, metadata, 0,
379+
11,900,1200);
380+
{
381+
nlohmann::json output = prov_gpu.get_json();
382+
std::cout << "For GPU event, got: " << output.dump() << std::endl;
383+
384+
EXPECT_EQ(output["is_gpu_event"], true);
385+
EXPECT_EQ(output["gpu_location"]["context"], 8);
386+
EXPECT_EQ(output["gpu_location"]["device"], 7);
387+
EXPECT_EQ(output["gpu_location"]["stream"], 1);
388+
389+
std::string got = output["gpu_parent"];
390+
std::string expect = "Chimbuko error: Host parent event could not be reached";
391+
std::cout << got << std::endl;
392+
393+
EXPECT_EQ(got, expect);
394+
}
395+
}
396+
397+
398+
399+
400+
401+
}
402+
403+
404+
405+
406+
407+
252408
TEST(TestADAnomalyProvenance, extractsExecWindow){
253409
ExecData_t exec0 = createFuncExecData_t(1,2,3, 33, "theonebefore", 900, 0); //not yet completed
254410
ExecData_t exec1 = createFuncExecData_t(1,2,3, 55, "theparent", 1000, 100);

test/unit_tests/ad/ADEvent.cpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include<chimbuko/ad/ADEvent.hpp>
22
#include<chimbuko/util/map.hpp>
3+
#include<chimbuko/util/error.hpp>
34
#include "gtest/gtest.h"
45
#include "../unit_test_common.hpp"
56

@@ -662,25 +663,31 @@ TEST(ADEventTest, DetectsCorrelationIDerrors){
662663
event_man.addCounter(gpu_corrid1);
663664
event_man.addCounter(gpu_corrid2);
664665

665-
//GPU events not allowed to have multiple correlation IDs
666-
bool got_error = false;
667-
try{
666+
//GPU events not allowed to have multiple correlation IDs (non-fatal)
667+
{
668+
std::stringstream err_str;
669+
Error().setStream(&err_str);
670+
668671
event_man.addFunc(gpu_exit);
669-
}catch(const std::exception &err){
670-
std::cout << "Caught intentional error: " << err.what() << std::endl;
671-
got_error = true;
672+
673+
std::string got = err_str.str();
674+
size_t loc = got.find("Encountered a GPU kernel execution with multiple correlation IDs!");
675+
std::cout << "Got intentional error: " << got << std::endl;
676+
EXPECT_NE(loc, std::string::npos);
672677
}
673-
EXPECT_EQ(got_error, true);
674678

675679
//CPU events are allowed to have multiple correlation IDs
676-
event_man.addCounter(cpu_corrid2);
680+
{
681+
event_man.addCounter(cpu_corrid2);
682+
683+
std::stringstream err_str;
684+
Error().setStream(&err_str);
677685

678-
got_error = false;
679-
try{
680686
event_man.addFunc(cpu_exit);
681-
}catch(const std::exception &err){
682-
std::cout << "Caught *un*intentional error: " << err.what() << std::endl;
683-
got_error = true;
687+
688+
std::string got = err_str.str();
689+
size_t loc = got.find("Encountered a GPU kernel execution with multiple correlation IDs!");
690+
std::cout << "This should be empty: " << got << std::endl;
691+
EXPECT_EQ(loc, std::string::npos);
684692
}
685-
EXPECT_EQ(got_error, false);
686693
}

0 commit comments

Comments
 (0)