@@ -249,6 +249,162 @@ TEST(TestADAnomalyProvenance, detectsGPUevents){
249249
250250
251251
// Checks that ADAnomalyProvenance reports a descriptive "Chimbuko error: ..." string in the
// "gpu_parent" field of its JSON output, rather than failing outright, in three scenarios:
//   1. the host event has a "Correlation ID" counter but the GPU event has none;
//   2. the GPU event carries two "Correlation ID" counters;
//   3. the host event with the matching correlation ID is trimmed from the event manager
//      before the provenance object is built.
// In every scenario the GPU location fields (context/device/stream) are still expected to be filled in.
252+ TEST (TestADAnomalyProvenance, gracefullyFailsIfCorrelationIDissues){
// Thread index used for every GPU-side event and for the GPU metadata entries below
253+ int gpu_thr = 9 ;
254+ int corrid_cid = 22 ; // counter index passed to every "Correlation ID" counter created below
255+
// A single event manager is shared by all three scenarios; each scenario uses its own
// correlation ID values (1234, 2222/3333, 4444).
256+ ADEvent event_man;
257+
258+ // Populate all the other stuff required to generate anomaly data
// Statistics accumulated from the values 0..49
259+ RunStats stats;
260+ for (int i=0 ;i<50 ;i++)
261+ stats.push (double (i));
262+
// The same statistics are registered under keys 44, 55 and 11.
// NOTE(review): presumably these are function ids (44 and 55 are passed to createFuncExecData_t
// below, 11 to the ADAnomalyProvenance constructor) -- confirm against those signatures.
263+ SstdParam param;
264+ param[44 ] = stats;
265+ param[55 ] = stats;
266+ param[11 ] = stats;
267+
268+ ADCounter counter;
269+
// Metadata entries registered for thread gpu_thr; the assertions in each scenario expect the
// resulting gpu_location to be context 8, device 7, stream 1.
270+ ADMetadataParser metadata;
271+ std::vector<MetaData_t> mdata = {
272+ MetaData_t (0 ,0 , gpu_thr, " CUDA Context" , " 8" ),
273+ MetaData_t (0 ,0 , gpu_thr, " CUDA Stream" , " 1" ),
274+ MetaData_t (0 ,0 , gpu_thr, " CUDA Device" , " 7" ),
275+ MetaData_t (0 ,0 , gpu_thr, " GPU[7] Device Name" , " Fake GPU" )
276+ };
277+ metadata.addData (mdata);
278+
// Scenario 1: GPU event without any correlation ID counter
279+ {
280+ std::cout << " Testing failure due to missing correlation ID" << std::endl;
281+
282+ // The host event gets a correlation ID counter but the device event does not
283+ int corridx1 = 1234 ;
284+ ExecData_t exec_gpu = createFuncExecData_t (0 ,1 , gpu_thr, 55 , " thegpufunction" , 1000 , 100 ); // on gpu
285+
286+ ExecData_t exec_cpu = createFuncExecData_t (0 ,1 , 0 , 44 , " thecpufunction" , 1000 , 100 ); // not on gpu
287+ exec_cpu.add_counter (createCounterData_t (0 ,1 , 0 , corrid_cid, corridx1, 1000 , " Correlation ID" ));
288+
// The host event is registered first; its returned iterator is not used again in this scenario
289+ CallListIterator_t exec_cpu_it = event_man.addCall (exec_cpu);
290+ CallListIterator_t exec_gpu_it = event_man.addCall (exec_gpu);
291+
// Build the provenance for the GPU event.
// NOTE(review): the meaning of the trailing arguments (0, 11, 900, 1200) is not visible here --
// confirm against the ADAnomalyProvenance constructor.
292+ ADAnomalyProvenance prov_gpu (*exec_gpu_it,
293+ event_man,
294+ param,
295+ counter, metadata, 0 ,
296+ 11 ,900 ,1200 );
297+ {
298+ nlohmann::json output = prov_gpu.get_json ();
299+ std::cout << " For GPU event, got: " << output.dump () << std::endl;
300+
// GPU identification must still succeed even though the parent lookup fails
301+ EXPECT_EQ (output[" is_gpu_event" ], true );
302+ EXPECT_EQ (output[" gpu_location" ][" context" ], 8 );
303+ EXPECT_EQ (output[" gpu_location" ][" device" ], 7 );
304+ EXPECT_EQ (output[" gpu_location" ][" stream" ], 1 );
305+
// The parent field is expected to hold an error message string in place of parent information
306+ std::string got = output[" gpu_parent" ];
307+ std::string expect = " Chimbuko error: Correlation ID of host parent event was not recorded" ;
308+ std::cout << got << std::endl;
309+
310+ EXPECT_EQ (got, expect);
311+ }
312+ }
313+
314+ // Scenario 2: failure due to multiple correlation IDs on the GPU event
315+ {
316+ std::cout << " Testing failure due to multiple correlation IDs" << std::endl;
317+ int corridx2 = 2222 , corridx3 = 3333 ;
318+
319+ ExecData_t exec_gpu = createFuncExecData_t (0 ,1 , gpu_thr, 55 , " thegpufunction" , 1000 , 100 ); // on gpu
320+ exec_gpu.add_counter (createCounterData_t (0 ,1 , gpu_thr, corrid_cid, corridx2, 1000 , " Correlation ID" )); // first of the 2 correlation IDs attached to the GPU event
321+ exec_gpu.add_counter (createCounterData_t (0 ,1 , gpu_thr, corrid_cid, corridx3, 1000 , " Correlation ID" ));
322+
// Two separate host events, one per correlation ID carried by the GPU event
323+ ExecData_t exec_cpu = createFuncExecData_t (0 ,1 , 0 , 44 , " thecpufunction" , 1000 , 100 ); // not on gpu
324+ exec_cpu.add_counter (createCounterData_t (0 ,1 , 0 , corrid_cid, corridx2, 1000 , " Correlation ID" ));
325+
326+ ExecData_t exec_cpu2 = createFuncExecData_t (0 ,1 , 0 , 66 , " theothercpufunction" , 1000 , 100 ); // not on gpu
327+ exec_cpu2.add_counter (createCounterData_t (0 ,1 , 0 , corrid_cid, corridx3, 1000 , " Correlation ID" ));
328+
// Both host events are registered before the GPU event; their iterators are not used again
329+ CallListIterator_t exec_cpu_it = event_man.addCall (exec_cpu);
330+ CallListIterator_t exec_cpu2_it = event_man.addCall (exec_cpu2);
331+
332+ CallListIterator_t exec_gpu_it = event_man.addCall (exec_gpu);
333+
334+ ADAnomalyProvenance prov_gpu (*exec_gpu_it,
335+ event_man,
336+ param,
337+ counter, metadata, 0 ,
338+ 11 ,900 ,1200 );
339+ {
340+ nlohmann::json output = prov_gpu.get_json ();
341+ std::cout << " For GPU event, got: " << output.dump () << std::endl;
342+
343+ EXPECT_EQ (output[" is_gpu_event" ], true );
344+ EXPECT_EQ (output[" gpu_location" ][" context" ], 8 );
345+ EXPECT_EQ (output[" gpu_location" ][" device" ], 7 );
346+ EXPECT_EQ (output[" gpu_location" ][" stream" ], 1 );
347+
348+ std::string got = output[" gpu_parent" ];
349+ std::string expect = " Chimbuko error: Multiple host parent event correlation IDs found, likely due to trace corruption" ;
350+ std::cout << got << std::endl;
351+
352+ EXPECT_EQ (got, expect);
353+ }
354+ }
355+
// Scenario 3: host parent event removed from the event manager before the lookup
356+ {
357+ std::cout << " Testing failure due to missing parent event" << std::endl;
358+
359+ // Both the device and the host event carry the same correlation ID, but the host event is trimmed out before the provenance is built
360+ int corridx4 = 4444 ;
361+ ExecData_t exec_gpu = createFuncExecData_t (0 ,1 , gpu_thr, 55 , " thegpufunction" , 1000 , 100 ); // on gpu
362+ exec_gpu.add_counter (createCounterData_t (0 ,1 , gpu_thr, corrid_cid, corridx4, 1000 , " Correlation ID" )); // single correlation ID, matching the host event below
363+
364+ ExecData_t exec_cpu = createFuncExecData_t (0 ,1 , 0 , 44 , " thecpufunction" , 1000 , 100 ); // not on gpu
365+ exec_cpu.add_counter (createCounterData_t (0 ,1 , 0 , corrid_cid, corridx4, 1000 , " Correlation ID" ));
366+
367+ CallListIterator_t exec_cpu_it = event_man.addCall (exec_cpu);
368+
369+ // Force the trimming out of the cpu event
// The value returned by trimCallList() is deleted immediately; only the trimming itself is wanted.
// NOTE(review): presumably the return value is a heap-allocated record of the trimmed calls -- confirm.
// exec_cpu_it is not used after this point.
370+ exec_cpu_it->can_delete (true );
371+ delete event_man.trimCallList ();
372+
// The GPU event is added only after the trim
373+ CallListIterator_t exec_gpu_it = event_man.addCall (exec_gpu);
374+
375+ ADAnomalyProvenance prov_gpu (*exec_gpu_it,
376+ event_man,
377+ param,
378+ counter, metadata, 0 ,
379+ 11 ,900 ,1200 );
380+ {
381+ nlohmann::json output = prov_gpu.get_json ();
382+ std::cout << " For GPU event, got: " << output.dump () << std::endl;
383+
384+ EXPECT_EQ (output[" is_gpu_event" ], true );
385+ EXPECT_EQ (output[" gpu_location" ][" context" ], 8 );
386+ EXPECT_EQ (output[" gpu_location" ][" device" ], 7 );
387+ EXPECT_EQ (output[" gpu_location" ][" stream" ], 1 );
388+
389+ std::string got = output[" gpu_parent" ];
390+ std::string expect = " Chimbuko error: Host parent event could not be reached" ;
391+ std::cout << got << std::endl;
392+
393+ EXPECT_EQ (got, expect);
394+ }
395+ }
396+
397+
398+
399+
400+
401+ }
402+
403+
404+
405+
406+
407+
252408TEST (TestADAnomalyProvenance, extractsExecWindow){
253409 ExecData_t exec0 = createFuncExecData_t (1 ,2 ,3 , 33 , " theonebefore" , 900 , 0 ); // not yet completed
254410 ExecData_t exec1 = createFuncExecData_t (1 ,2 ,3 , 55 , " theparent" , 1000 , 100 );
0 commit comments