Skip to content

Commit 1c8949e

Browse files
committed
Replaced ADEvent's / ExecData_t's method of labeling events as deletable by the garbage collection with a reference count strategy where events are only trimmed when the reference count is 0
This fixes a bug found where common parents of host events that have GPU events were getting prematurely trimmed Added a unit test for the above case Implemented ADEvent::clear
1 parent 6860bde commit 1c8949e

6 files changed

Lines changed: 148 additions & 45 deletions

File tree

include/chimbuko/ad/ADEvent.hpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ namespace chimbuko {
252252
std::pair<CallListIterator_t, CallListIterator_t> getCallWindowStartEnd(const eventID &event_id, const int win_size) const override;
253253

254254
/**
255-
* @brief clear
255+
* @brief Clear all data members
256256
*
257257
*/
258258
void clear();
@@ -405,11 +405,6 @@ namespace chimbuko {
405405
*/
406406
std::unordered_map<unsigned long, CallListIterator_t> m_unmatchedCorrelationID;
407407

408-
/**
409-
* @brief Map of event index to the number of unmatched correlation IDs
410-
*/
411-
std::unordered_map<eventID,size_t> m_unmatchedCorrelationID_count;
412-
413408
/**
414409
* @brief verbose
415410
*

include/chimbuko/ad/ExecData.hpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -594,13 +594,22 @@ class ExecData_t {
594594
/**
595595
* @brief Determine whether the event can be deleted by the garbage collection at the end of the io step
596596
*/
597-
bool can_delete() const{ return m_can_delete;}
597+
inline bool can_delete() const{ return m_references == 0;}
598598

599599
/**
600-
* @brief Set whether the event can be deleted by the garbage collection at the end of the io step (default true)
601-
*/
602-
void can_delete(const bool v){ m_can_delete = v; }
600+
* @brief Increment the external reference counter, preventing object deletion
601+
*/
602+
inline void register_reference(){ ++m_references; }
603+
604+
/**
605+
* @brief Decrement the external reference counter, allowing object deletion if 0
606+
*/
607+
void deregister_reference();
603608

609+
/**
610+
* @brief Get the number of external references registered
611+
*/
612+
inline unsigned long reference_count() const{ return m_references; }
604613

605614
/**
606615
* @brief Set the partner event linked by a GPU correlation ID
@@ -645,7 +654,7 @@ class ExecData_t {
645654
unsigned long m_n_messages; /**< the number of messages */
646655
std::deque<CommData_t> m_messages; /**< a vector of all messages */
647656
std::deque<CounterData_t> m_counters; /**< a vector of all counters */
648-
bool m_can_delete; /**< Flag indicating that the event is deletable by the garbage collection */
657+
unsigned long m_references; /**< track number of external references to object. When 0 the object can be deleted */
649658
std::vector<eventID> m_gpu_correlation_id_partner; /**< The event ids partner events linked by a correlation ID, either the launching CPU event or the GPU kernel event */
650659
};
651660

src/ad/ADEvent.cpp

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,20 @@ ADEvent::ADEvent(bool verbose)
1515
}
1616

1717
ADEvent::~ADEvent() {
18-
clear();
1918
}
2019

2120
void ADEvent::clear() {
21+
m_eidx_func_entry = -1;
22+
m_eidx_func_exit = -1;
23+
m_eidx_comm_send = -1;
24+
m_eidx_comm_recv = -1;
25+
m_commStack.clear();
26+
m_counterStack.clear();
27+
m_callStack.clear();
28+
m_callList.clear();
29+
m_execDataMap.clear();
30+
m_callIDMap.clear();
31+
m_unmatchedCorrelationID.clear();
2232
}
2333

2434
EventError ADEvent::addEvent(const Event_t& event) {
@@ -32,7 +42,8 @@ EventError ADEvent::addEvent(const Event_t& event) {
3242
}
3343

3444
void ADEvent::stackProtectGC(CallListIterator_t it){
35-
it->can_delete(false);
45+
verboseStream << "ADEvent::stackProtectGC incrementing register count of " << it->get_id().toString() << " to " << it->reference_count()+1 << std::endl;
46+
it->register_reference();
3647

3748
eventID parent = it->get_parent();
3849
while(parent != eventID::root()){
@@ -43,18 +54,15 @@ void ADEvent::stackProtectGC(CallListIterator_t it){
4354
recoverable_error("Could not find parent " + parent.toString() + " in call list due to : " + e.what());
4455
break;
4556
}
46-
47-
pit->can_delete(false);
57+
verboseStream << "ADEvent::stackProtectGC incrementing register count of " << pit->get_id().toString() << " to " << pit->reference_count()+1 << std::endl;
58+
pit->register_reference();
4859
parent = pit->get_parent();
4960
}
5061
}
5162

5263
void ADEvent::stackUnProtectGC(CallListIterator_t it){
53-
//Check if has unmatched correlation iD
54-
auto umit = m_unmatchedCorrelationID_count.find(it->get_id());
55-
if(umit == m_unmatchedCorrelationID_count.end() || umit->second == 0)
56-
it->can_delete(true);
57-
else return; //stop here, the stack will be needed
64+
verboseStream << "ADEvent::stackUnProtectGC decrementing register count of " << it->get_id().toString() << " to " << it->reference_count()-1 << std::endl;
65+
it->deregister_reference();
5866

5967
eventID parent = it->get_parent();
6068
while(parent != eventID::root()){
@@ -65,12 +73,8 @@ void ADEvent::stackUnProtectGC(CallListIterator_t it){
6573
recoverable_error("Could not find parent " + parent.toString() + " in call list due to : " + e.what());
6674
break;
6775
}
68-
69-
umit = m_unmatchedCorrelationID_count.find(pit->get_id());
70-
if(umit == m_unmatchedCorrelationID_count.end() || umit->second == 0)
71-
pit->can_delete(true);
72-
else break; //stop here, the stack will be needed
73-
76+
verboseStream << "ADEvent::stackUnProtectGC decrementing register count of " << pit->get_id().toString() << " to " << pit->reference_count()-1 << std::endl;
77+
pit->deregister_reference();
7478
parent = pit->get_parent();
7579
}
7680
}
@@ -85,29 +89,27 @@ void ADEvent::checkAndMatchCorrelationID(CallListIterator_t it){
8589
for(auto const &c : it->get_counters()){
8690
if(c.get_countername() == "Correlation ID"){
8791
unsigned long cid = c.get_value();
92+
eventID current_event_id = it->get_id();
8893

8994
//Does a partner already exist?
9095
auto m = m_unmatchedCorrelationID.find(cid);
9196
if(m != m_unmatchedCorrelationID.end()){
92-
eventID current_event_id = it->get_id();
9397
eventID partner_event_id = m->second->get_id();
98+
99+
//Check partner event hasn't been accidentally deleted (thus invalidating its iterator)
100+
if(m_callIDMap.find(partner_event_id) == m_callIDMap.end()) fatal_error("Correlation ID partner event is not in the call list!");
101+
94102
it->set_GPU_correlationID_partner(partner_event_id);
95103
m->second->set_GPU_correlationID_partner(current_event_id);
96104

97-
size_t rem = --m_unmatchedCorrelationID_count[partner_event_id];
98-
if(rem == 0){
99-
m_unmatchedCorrelationID_count.erase(partner_event_id); //no need to keep this around now
100-
stackUnProtectGC(m->second);
101-
}
105+
verboseStream << "Current event " << current_event_id.toString() << " is partnered with previous unmatched event " << partner_event_id.toString() << " with correlation ID " << cid << std::endl;
106+
stackUnProtectGC(m->second);
102107
m_unmatchedCorrelationID.erase(cid); //remove now-matched correlation ID
103-
104-
verboseStream << "Found partner event " << current_event_id.toString() << " to previous unmatched event " << partner_event_id.toString() << " with correlation ID " << cid << std::endl;
105108
}else{
106109
//Ensure the event and it's parental line can't be deleted and put it in the map of unmatched events
110+
verboseStream << "Found as-yet unpartnered event " << current_event_id.toString() << " with correlation ID " << cid << std::endl;
107111
stackProtectGC(it);
108112
m_unmatchedCorrelationID[cid] = it;
109-
++m_unmatchedCorrelationID_count[it->get_id()];
110-
verboseStream << "Found as-yet unpartnered event with correlation ID " << cid << std::endl;
111113
}
112114
n_cid++;
113115
}

src/ad/ExecData.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ ExecData_t::ExecData_t()
1010
: m_pid(0), m_rid(0), m_tid(0), m_fid(0),
1111
m_entry(0), m_exit(0), m_runtime(0), m_exclusive(0),
1212
m_n_children(0), m_n_messages(0),
13-
m_label(0), m_can_delete(true), m_gpu_correlation_id_partner(0), m_score(-1){}
13+
m_label(0), m_gpu_correlation_id_partner(0), m_score(-1), m_references(0){}
1414

1515
ExecData_t::ExecData_t(const Event_t& ev) : ExecData_t()
1616
{
@@ -39,6 +39,11 @@ ExecData_t::ExecData_t(const eventID &id, unsigned long pid, unsigned long rid,
3939

4040
ExecData_t::~ExecData_t() {}
4141

42+
void ExecData_t::deregister_reference(){
43+
if(m_references == 0) fatal_error("Attempting to deregister and event whose external reference count is 0!");
44+
--m_references;
45+
}
46+
4247
void ExecData_t::update_exit(unsigned long exit){
4348
m_exit = exit;
4449
m_runtime = m_exit - m_entry;

test/unit_tests/ad/ADAnomalyProvenance.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,8 @@ TEST(TestADAnomalyProvenance, gracefullyFailsIfCorrelationIDissues){
359359

360360
{
361361
std::cout << "Testing failure due to missing parent event" << std::endl;
362+
event_man.clear();
363+
ASSERT_EQ(event_man.getCallListSize(),0);
362364

363365
//Have a host correlation ID but not a device one
364366
int corridx4 = 4444;
@@ -369,13 +371,20 @@ TEST(TestADAnomalyProvenance, gracefullyFailsIfCorrelationIDissues){
369371
exec_cpu.add_counter(createCounterData_t(0,1, 0, corrid_cid, corridx4, 1000, "Correlation ID"));
370372

371373
CallListIterator_t exec_cpu_it = event_man.addCall(exec_cpu);
374+
CallListIterator_t exec_gpu_it = event_man.addCall(exec_gpu);
372375

373-
//Force the trimming out of the cpu event
374-
exec_cpu_it->can_delete(true);
376+
//Both events should be trimmable
377+
ASSERT_EQ(exec_cpu_it->reference_count(),0);
378+
ASSERT_EQ(exec_gpu_it->reference_count(),0);
379+
380+
//Trim out the parent
381+
exec_gpu_it->register_reference(); //protect GPU event
375382
delete event_man.trimCallList();
376-
377-
CallListIterator_t exec_gpu_it = event_man.addCall(exec_gpu);
378383

384+
ASSERT_EQ(event_man.getCallListSize(),1);
385+
std::cout << "Trimmed out event " << exec_cpu.get_id().toString() << std::endl;
386+
387+
std::cout << "Creating ADAnomalyProvenance" << std::endl;
379388
ADAnomalyProvenance prov_gpu(*exec_gpu_it,
380389
event_man,
381390
param,

test/unit_tests/ad/ADEvent.cpp

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,10 @@ TEST(ADEvent, trimsCallListCorrectlyWithGCFlag){
454454

455455
//Add 2 events but mark 1 to be undeletable
456456
event_man.addCall(c1);
457-
c2.can_delete(false);
457+
c2.register_reference();
458+
EXPECT_EQ(c2.can_delete(), false);
459+
EXPECT_EQ(c2.reference_count(), 1);
460+
458461
event_man.addCall(c2);
459462

460463
auto const* calls_p_r_t_ptr = getElemPRT(pid,rid,tid,event_man.getCallListMap());
@@ -509,10 +512,30 @@ TEST(ADEvent, matchesEventsByCorrelationID){
509512
ExecData_t c_cpu_p2 = createFuncExecData_t(pid, rid, tid_cpu, 7, "cpu_launch_kernel_grandparent", 0, 300); //0-300
510513
bindParentChild(c_cpu_p2, c_cpu_p1);
511514

515+
std::cout << "c_cpu_p2 : " << c_cpu_p2.get_id().toString() << std::endl;
516+
std::cout << "c_cpu_p1 : " << c_cpu_p1.get_id().toString() << std::endl;
517+
std::cout << "c_cpu : " << c_cpu.get_id().toString() << std::endl;
518+
519+
520+
auto c_cpu_p2_it = event_man.addCall(c_cpu_p2);
512521

513-
event_man.addCall(c_cpu_p2);
514-
event_man.addCall(c_cpu_p1);
515-
event_man.addCall(c_cpu);
522+
//c_cpu should not be locked yet
523+
ASSERT_EQ(c_cpu_p2_it->reference_count(), 0);
524+
525+
auto c_cpu_p1_it = event_man.addCall(c_cpu_p1);
526+
527+
//This should lock both c_cpu_p2 and c_cpu_p1
528+
ASSERT_EQ(c_cpu_p2_it->reference_count(), 1);
529+
ASSERT_EQ(c_cpu_p1_it->reference_count(), 1);
530+
531+
auto c_cpu_it = event_man.addCall(c_cpu);
532+
533+
//c_cpu should be doubly locked
534+
ASSERT_EQ(c_cpu_it->reference_count(), 2);
535+
//c_cpu_p1 should be triply locked
536+
ASSERT_EQ(c_cpu_p1_it->reference_count(), 3);
537+
//c_cpu_p2 should be triply locked
538+
ASSERT_EQ(c_cpu_p2_it->reference_count(), 3);
516539

517540
//Ensure the correlation IDs got picked up
518541
EXPECT_EQ( event_man.getUnmatchCorrelationIDevents().size(), 3 );
@@ -589,6 +612,66 @@ TEST(ADEvent, matchesEventsByCorrelationID){
589612

590613

591614

615+
TEST(ADEvent, testCoridMatchChainUnlock){
616+
//Ensure that for a tree with multiple GPU events and common parents, when unlocking the tree for the first event
617+
//that we don't allow the common parents to get unlocked before the second event is matched
618+
619+
ADEvent event_man;
620+
int pid = 1;
621+
int rid = 2;
622+
int tid_cpu = 3;
623+
int tid_gpu = 4;
624+
int counter_id = 13; //index of "Correlation ID" counter
625+
626+
//First CPU event
627+
ExecData_t c_cpu_1 = createFuncExecData_t(pid, rid, tid_cpu, 4, "cpu_launch_kernel_1", 100, 200); //100-300
628+
c_cpu_1.add_counter(createCounterData_t(pid,rid,tid_cpu,counter_id, 1995, 100, "Correlation ID"));
629+
630+
ExecData_t c_gpu1 = createFuncExecData_t(pid, rid, tid_gpu, 6, "gpu_kernel1", 400, 100); //400-500
631+
c_gpu1.add_counter(createCounterData_t(pid,rid,tid_gpu,counter_id, 1995, 500, "Correlation ID"));
632+
633+
//Second CPU event
634+
ExecData_t c_cpu_2 = createFuncExecData_t(pid, rid, tid_cpu, 5, "cpu_launch_kernel_2", 150, 250); //150-250
635+
c_cpu_2.add_counter(createCounterData_t(pid,rid,tid_cpu,counter_id, 2020, 150, "Correlation ID"));
636+
637+
ExecData_t c_gpu2 = createFuncExecData_t(pid, rid, tid_gpu, 7, "gpu_kernel2", 500, 100); //500-600
638+
c_gpu2.add_counter(createCounterData_t(pid,rid,tid_gpu,counter_id, 2020, 600, "Correlation ID"));
639+
640+
//Common parent event
641+
ExecData_t c_cpu_p1 = createFuncExecData_t(pid, rid, tid_cpu, 8, "cpu_launch_kernel_parent", 50, 500); //50-550
642+
bindParentChild(c_cpu_p1, c_cpu_1);
643+
bindParentChild(c_cpu_p1, c_cpu_2);
644+
645+
auto c_cpu_p1_it = event_man.addCall(c_cpu_p1);
646+
auto c_cpu_1_it = event_man.addCall(c_cpu_1);
647+
auto c_cpu_2_it = event_man.addCall(c_cpu_2);
648+
649+
//Ensure the correlation IDs got picked up
650+
EXPECT_EQ( event_man.getUnmatchCorrelationIDevents().size(), 2 );
651+
652+
//Both the CPU and parent event should be locked
653+
EXPECT_EQ( c_cpu_p1_it->can_delete(), false );
654+
EXPECT_EQ( c_cpu_1_it->can_delete(), false );
655+
EXPECT_EQ( c_cpu_2_it->can_delete(), false );
656+
657+
658+
//Add the first GPU event
659+
auto c_gpu1_it = event_man.addCall(c_gpu1);
660+
661+
//Check correlation ID was matched
662+
EXPECT_EQ( event_man.getUnmatchCorrelationIDevents().size(), 1 );
663+
664+
//Check both gpu1 and cpu1 are unlocked
665+
EXPECT_EQ( c_cpu_1_it->can_delete(), true );
666+
EXPECT_EQ( c_gpu1_it->can_delete(), true );
667+
668+
//Check parent is *not* unlocked
669+
EXPECT_EQ( c_cpu_p1_it->can_delete(), false );
670+
}
671+
672+
673+
674+
592675

593676
TEST(ADEvent, testIteratorWindowDetermination){
594677
ADEvent event_man;

0 commit comments

Comments
 (0)