Skip to content

Commit cc527ba

Browse files
committed
sstSinker can now optionally dump the trace in ASCII format
sst_view_parse can now optionally locate unmatched correlation IDs
1 parent c9946b3 commit cc527ba

2 files changed

Lines changed: 133 additions & 17 deletions

File tree

app/sstSinker.cpp

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,39 @@
33
#include <chrono>
44
#include "chimbuko/util/commandLineParser.hpp"
55
#include "chimbuko/util/string.hpp"
6+
#include "chimbuko/util/error.hpp"
67

78
using namespace chimbuko;
89
using namespace std::chrono;
910

1011
/**
 * @brief Optional command-line arguments of the sinker application.
 *
 * The defaults set by the constructor are used for any option that is not
 * given on the command line.
 */
struct SinkerArgs{
  int timeout; /**< SST connect timeout in seconds (default 60) */
  int beginstep_timeout; /**< SST beginStep timeout in seconds (default 30) */
  bool dump_events; /**< If true, the parsed events are dumped to a file (default false) */

  SinkerArgs(): timeout(60), beginstep_timeout(30), dump_events(false){}
};
1618

19+
void dumpEvents(std::ostream &os, const std::vector<Event_t> &events, ADParser* parser){
20+
const std::unordered_map<int, std::string> &func_map = *parser->getFuncMap();
21+
const std::unordered_map<int, std::string> &counter_map = *parser->getCounterMap();
22+
const std::unordered_map<int, std::string> &event_type_map = *parser->getEventType();
23+
24+
for(const Event_t &e: events){
25+
if(e.type() == EventDataType::FUNC){
26+
auto eit = event_type_map.find(e.eid()); if(eit == event_type_map.end()) fatal_error("Could not find event type in map!");
27+
const std::string &etype = eit->second;
28+
auto fit = func_map.find(e.fid()); if(fit == func_map.end()) fatal_error("Could not find fid in map!");
29+
os << e.tid() << " " << e.ts() << " FUNC " << etype << " " << fit->second << std::endl;
30+
}else if(e.type() == EventDataType::COUNT){
31+
auto cit = counter_map.find(e.counter_id()); if(cit == counter_map.end()) fatal_error("Could not find counter id in map!");
32+
os << e.tid() << " " << e.ts() << " COUNT " << cit->second << " " << e.counter_value() << std::endl;
33+
}else if(e.type() == EventDataType::COMM){
34+
os << e.tid() << " " << e.ts() << " COMM " << e.partner() << " " << e.bytes() << std::endl;
35+
}
36+
}
37+
}
38+
1739

1840
int main(int argc, char ** argv){
1941
MPI_Init(&argc, &argv);
@@ -28,9 +50,10 @@ int main(int argc, char ** argv){
2850
commandLineParser<SinkerArgs> cmdline;
2951
addOptionalCommandLineArg(cmdline, timeout, "Specify the SST connect timeout in seconds (Default 60s)");
3052
addOptionalCommandLineArg(cmdline, beginstep_timeout, "Specify the SST beginStep timeout in seconds (Default 30s)");
31-
53+
addOptionalCommandLineArg(cmdline, dump_events, "Request that the parsed events be dumped to a file \"${BPFILENAME}.dump\". Requires \"fetch\" to be true. (Default false)");
54+
3255
if(argc < 5 || (argc == 2 && std::string(argv[1]) == "-help")){
33-
std::cout << "Usage: <exe> <engine type (BPFile, SST)> <bp directory> <bpfile prefix (eg tau-metrics-nwchem)> <fetch>\n"
56+
std::cout << "Usage: <exe> <engine type (BPFile, SST)> <bp directory> <bpfile prefix (eg tau-metrics-nwchem)> <fetch> <options>\n"
3457
<< "Where \"fetch\" indicates whether the data is actually transferred or we just iterate over the IO steps\n"
3558
<< "Options:" << std::endl;
3659
cmdline.help(std::cout);
@@ -46,6 +69,8 @@ int main(int argc, char ** argv){
4669
SinkerArgs args;
4770
cmdline.parse(args, argc-5, (const char**)(argv+5) );
4871

72+
if(args.dump_events && !fetch_data) fatal_error("dump_events option requires fetch=1");
73+
4974
if (world_rank == 0) {
5075
std::cout << "\n"
5176
<< "rank : " << world_rank << "\n"
@@ -79,6 +104,10 @@ int main(int argc, char ** argv){
79104

80105
parser->setBeginStepTimeout(args.beginstep_timeout);
81106

107+
//Initialize dump output
108+
std::ofstream *dump = nullptr;
109+
if(args.dump_events) dump = new std::ofstream(inputFile + ".dump");
110+
82111
// -----------------------------------------------------------------------
83112
// Start analysis
84113
// -----------------------------------------------------------------------
@@ -102,6 +131,9 @@ int main(int argc, char ** argv){
102131
parser->fetchFuncData();
103132
parser->fetchCommData();
104133
parser->fetchCounterData();
134+
135+
if(args.dump_events)
136+
dumpEvents(*dump, parser->getEvents(),parser);
105137
}
106138

107139
frames++;
@@ -140,6 +172,7 @@ int main(int argc, char ** argv){
140172
// Finalize
141173
// -----------------------------------------------------------------------
142174
delete parser;
175+
if(dump) delete dump;
143176
}
144177
catch (std::invalid_argument &e)
145178
{

app/sst_view_parse.pl

Lines changed: 96 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,41 @@
22
no warnings qw(experimental);
33
use feature qw(refaliasing);
44

5-
if(scalar @ARGV != 1){
6-
print "Usage: <script> <log file>\n";
5+
#Command line: <script> <log file> <options>
#The first argument is the log file; the remaining arguments are option flags.
$ARGC=scalar @ARGV;

if($ARGC < 1){
    print "Usage: <script> <log file> <options>\n";
    print "Options:\n";
    print " -analyze_coridx Analyze unmatched correlation IDs\n";
    exit;
}

$logfile = $ARGV[0];

#Set to 1 by the -analyze_coridx option
$ana_corid=0;
for($i=1;$i<$ARGC;$i++){
    #String comparison: numeric == would treat any non-numeric argument as equal to the option name
    if($ARGV[$i] eq "-analyze_coridx"){
	$ana_corid=1;
	print "Analyzing correlation IDs\n";
    }
}


print "Using log $logfile\n";

#Read the whole log into memory; stop with a message if it cannot be opened
open(IN, '<', $logfile) or die "Could not open log file $logfile: $!\n";
@lines = <IN>;
close(IN);
1730

18-
$active = 0;
19-
$n = 0;
20-
$nrows = 0;
31+
#State for parsing an event_timestamps array:
#  active: 1 while inside the array, n: rows parsed so far, nrows: size announced by the "Parsed size" line
$event_active = 0;
$event_n = 0;
$event_nrows = 0;

#Same state for a counter_values array; the names match those used by the parsing code
$counter_active = 0;
$counter_n = 0;
$counter_nrows = 0;

$corid_cidx=-1; #index of correlation ID counter
2140

2241
%funcid_name_map = ();
2342
%gputhread_info = ();
@@ -26,6 +45,8 @@
2645
#Where data array has the format [ (state (0 inactive, 1 active)), (timestamp), [(elapsed times)] ]
2746
%funcs = ();
2847

48+
%corids = ();
49+
$ncorid_matched=0;
2950

3051
for($i=0;$i<scalar @lines;$i++){
3152
$line = $lines[$i];
@@ -37,6 +58,12 @@
3758
$funcid_name_map{$1} = $2;
3859
#print "Found fid map: $1 : $2\n";
3960

61+
#Parse correlation ID counter index
62+
#FOUND NEW ATTRIBUTE: counter 0 {Elements:1 Type:string Value:"Correlation ID" }
63+
}elsif($line=~m/counter\s(\d+)\s.*Value:\"Correlation ID\"/){
64+
$corid_cidx=$1;
65+
66+
4067
#Parse GPU device/context
4168
#FOUND NEW ATTRIBUTE: MetaData:0:7:CUDA Context {Elements:1 Type:string Value:"1" }
4269
#FOUND NEW ATTRIBUTE: MetaData:0:7:CUDA Device {Elements:1 Type:string Value:"0" }
@@ -52,13 +79,13 @@
5279

5380
#Parse open of event timestamps list for this timestep
5481
}elsif($line=~m/event_timestamps.*Parsed\ssize\s(\d+)/){
55-
$nrows = $1;
56-
#print "Rows $nrows\n";
57-
$active = 1;
58-
$n = 0;
82+
$event_nrows = $1;
83+
#print "Rows $event_nrows\n";
84+
$event_active = 1;
85+
$event_n = 0;
5986

6087
#Parse an entry in the event_timestamps array
61-
}elsif($active == 1){
88+
}elsif($event_active == 1){
6289
#print "Active: $line\n";
6390
#Example line format
6491
#(45 0 ):0 (45 1 ):0 (45 2 ):5 (45 3 ):1 (45 4 ):21 (45 5 ):1585852643242747
@@ -106,10 +133,47 @@
106133
#print "Found exit from func $func thread $thread: elapsed $elapsed\n";
107134
}
108135

109-
$n++;
110-
if($n == $nrows){
111-
$active = 0;
136+
$event_n++;
137+
if($event_n == $event_nrows){
138+
$event_active = 0;
139+
}
140+
141+
#Counter timestamps
142+
}elsif($line=~m/counter_values.*Parsed\ssize\s(\d+)/){
143+
$counter_nrows = $1;
144+
$counter_active = 1;
145+
$counter_n = 0;
146+
147+
#Parse an entry in the counter_values array
148+
}elsif($counter_active == 1){
149+
#print "Active: $line\n";
150+
#Example line format
151+
#(1 0 ):0 (1 1 ):0 (1 2 ):0 (1 3 ):0 (1 4 ):2 (1 5 ):1629924204499736
152+
#pid rid tid cidx cval ts
153+
154+
if(!($line=~m/2\s\)\:(\d+)\s\(\d+\s3\s\)\:(\d+)\s\(\d+\s4\s\)\:(\d+)\s\(\d+\s5\s\)\:(\d+)/)){
155+
print "ERR\n";
156+
exit;
157+
}
158+
$thread = $1;
159+
$cid = $2;
160+
$cval = $3;
161+
$ts = $4;
162+
163+
if(exists($corids{$cval})){
164+
print "Found matching corid $cval\n";
165+
delete $corids{$cval};
166+
$ncorid_matched++;
167+
}else{
168+
print "Found new corid $cval\n";
169+
$corids{$cval} = [$thread, $cid, $cval, $ts];
112170
}
171+
172+
$counter_n++;
173+
if($counter_n == $counter_nrows){
174+
$counter_active = 0;
175+
}
176+
113177
}
114178

115179
}
@@ -157,3 +221,22 @@
157221
}
158222
}
159223
}
224+
225+
#Report on correlation IDs (only if requested with -analyze_coridx).
#Entries still in %corids at this point were seen once and never matched.
#Each entry has the format [ thread, counter index, counter value, timestamp ]
if($ana_corid == 1){
    if($corid_cidx==-1){
	print "Could not find correlation ID counter index!\n";
	exit 1;
    }
    print "Correlation ID counter index is $corid_cidx\n";
    $nunmatched = scalar(keys %corids);
    print "Found $ncorid_matched matched and $nunmatched unmatched correlation IDs\n";

    #Sort numerically by correlation ID so the report order is the same on every run
    foreach $c (sort { $a <=> $b } keys %corids){
	$thread = $corids{$c}->[0];
	$ts = $corids{$c}->[3];
	print "$c with timestamp $ts on thread $thread\n";
    }
}

0 commit comments

Comments
 (0)