Skip to content

Commit bd4ef29

Browse files
committed
It seems that, with CUDA 12.4 and the latest TAU, GPU correlation IDs now appear associated with the kernel exit rather than the entry. To combat this, I disabled the special-case ordering for correlation ID event parsing. Will need to revisit this issue for other versions / GPU APIs.
Updated default config script for benchmark_suite/cupti_gpu_kernel_outlier
1 parent 408dd2b commit bd4ef29

2 files changed

Lines changed: 35 additions & 9 deletions

File tree

benchmark_suite/cupti_gpu_kernel_outlier/chimbuko_config.sh

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#IMPORTANT NOTE: Variables that cannot be left as default are marked as <------------ ***SET ME***
55

6-
service_node_iface=eth0 #network interface upon which communication to the service node is performed <------------ ***SET ME***
6+
module="performance_analysis"
77

88
####################################
99
#Options for visualization module
@@ -27,13 +27,31 @@ use_provdb=1 #enable or disable the provDB. If disabled the provenance data will
2727
provdb_extra_args="" #any extra command line arguments to pass
2828
provdb_nshards=4 #number of database shards
2929
provdb_ninstances=1 #number of database server instances. Shards are distributed over instances
30-
provdb_engine="ofi+tcp;ofi_rxm" #the OFI libfabric provider used for the Mochi stack
30+
provdb_engine="sockets" #the OFI libfabric provider used for the Mochi stack
3131
provdb_port=5000 #the port of the provenance database
3232
provdb_writedir=chimbuko/provdb #the directory in which the provenance database is written. Chimbuko creates chimbuko/provdb which can be used as a default
3333
provdb_commit_freq=10000 #frequency (in ms) at which the provenance database is committed to disk. If set to 0 it will commit only at the end
3434

35-
#With "verbs" provider (used for infiniband, iWarp, etc) we need to also specify the domain, which can be found by running fi_info (on a compute node)
36-
provdb_domain=mlx5_0 #only needed for verbs provider <------------ ***SET ME (if using verbs)***
35+
#provdb_interface : network interface upon which communication to the provdb is performed. <------------ ***SET ME***
36+
# This variable has several options:
37+
# auto - let Mercury automatically choose an interface for all instances
38+
# <iface> - a single interface used for all instances
39+
# <iface1>:<iface2>:<iface3> .... - a colon-separated list of interfaces, one per instance
40+
# Obtain a list of interfaces from, e.g. "ip link show" (cf https://www.cyberciti.biz/faq/linux-list-network-interfaces-names-command/).
41+
provdb_interface=auto
42+
43+
#provdb_domain : With "verbs" provider (used for infiniband, iWarp, etc) we need to also specify the domain, which can be found by running fi_info (on a compute node)
44+
# If left blank it will be chosen automatically. <------------ ***SET ME (if using verbs)***
45+
provdb_domain=
46+
47+
#provdb_numa_bind : specify NUMA domain binding for the provdb instances (requires numactl)
48+
# This variable has several options:
49+
# <blank> - if left blank, no binding will be performed
50+
# <index> - a single NUMA domain for all instances
51+
# <idx1>:<idx2>:<idx3> ... - a colon-separated list of NUMA domains, one per instance
52+
provdb_numa_bind=
53+
54+
commit_extra_args="" #extra arguments for the committer
3755

3856
export FI_UNIVERSE_SIZE=1600 # Defines the expected number of provenance DB clients per instance <------------- *** SET ME (should be larger than the number of clients/instance)
3957
export FI_MR_CACHE_MAX_COUNT=0 # disable MR cache in libfabric; still problematic as of libfabric 1.10.1
@@ -44,14 +62,17 @@ export FI_OFI_RXM_USE_SRX=1 # use shared recv context in RXM; should improve sca
4462
####################################
4563
use_pserver=1 #enable or disable the pserver
4664
pserver_extra_args="" #any extra command line arguments to pass
65+
pserver_interface=eth0 #network interface upon which communication to the pserver is performed. Obtain from, e.g. "ip link show" (cf https://www.cyberciti.biz/faq/linux-list-network-interfaces-names-command/). <------------ ***SET ME***
4766
pserver_port=5559 #port for parameter server
4867
pserver_nt=2 #number of worker threads
68+
pserver_numa_bind= #specify NUMA domain binding for the pserver (requires numactl). If left blank, no binding will be performed
69+
4970
####################################
5071
#Options for the AD module
5172
####################################
5273
ad_extra_args="-perf_outputpath chimbuko/logs -perf_step 1" #any extra command line arguments to pass. Note: chimbuko/logs is automatically created by services script
5374
ad_win_size=5 #number of events around an anomaly to store; provDB entry size is proportional to this so keep it small!
54-
ad_alg="sstd" #the anomaly detection algorithm. Valid values are "hbos" and "sstd"
75+
ad_alg="hbos" #the anomaly detection algorithm. Valid values are "hbos" and "sstd"
5576
ad_outlier_hbos_threshold=0.99 #the percentile of events outside of which are considered anomalies by the HBOS algorithm
5677
ad_outlier_sstd_sigma=12 #number of standard deviations that defines an outlier in the SSTD algorithm
5778
####################################
@@ -60,20 +81,22 @@ ad_outlier_sstd_sigma=12 #number of standard deviations that defines an outlier
6081
# command to launch the AD (output to chimbuko/vars/chimbuko_ad_cmdline.var); they can be overridden by the run script if desired providing the appropriate modifications
6182
# are made to the AD launch command. The remainder of the variables are used only by TAU and can be freely overridden.
6283
####################################
63-
export TAU_ADIOS2_ENGINE=SST #online communication engine (alternative BP4 although this goes through the disk system and may be slower unless the BPfiles are stored on a burst disk)
84+
export TAU_ADIOS2_ENGINE=BP4 #online communication engine (BP4 goes through the disk system and may be slower than the alternative, SST, unless the BPfiles are stored on a burst disk)
6485
export TAU_ADIOS2_ONE_FILE=FALSE #a different connection file for each rank
6586
export TAU_ADIOS2_PERIODIC=1 #enable/disable ADIOS2 periodic output
6687
export TAU_ADIOS2_PERIOD=1000000 #period in us between ADIOS2 io steps
6788
export TAU_THREAD_PER_GPU_STREAM=1 #force GPU streams to appear as different TAU virtual threads
6889
export TAU_THROTTLE=0 #enable/disable throttling of short-running functions
6990

70-
export TAU_MAKEFILE=/opt/tau2/x86_64/lib/Makefile.tau-papi-mpi-pthread-python-cupti-pdt-adios2 #The TAU makefile to use <------------ ***SET ME***
91+
#export TAU_MAKEFILE=/opt/tau2/x86_64/lib/Makefile.tau-papi-mpi-pthread-python-cupti-pdt-adios2 #The TAU makefile to use <------------ ***SET ME***
92+
93+
tau_monitoring_conf="default" #Provide a configuration file for the TAU monitoring plugin. It will be copied to the work directory as "tau_monitoring.json" (unless it is already there!). If set to default, Chimbuko will generate one automatically
7194

7295
#Note: the following 2 variables are not used by the service script but are included here for use from the user's run script allowing the application to be launched with either "${TAU_EXEC} <app>" or "${TAU_PYTHON} <app>"
7396
#Note: the "binding" -T ... is used by Tau to find the appropriate configuration. It can typically be inferred from the name of the Makefile. If using a non-MPI job the 'mpi' should be changed to 'serial' and a non-MPI build of
7497
# ADIOS2/TAU must exist
7598
#Suggestion: It is useful to test the command without Chimbuko first to ensure TAU picks up the correct binding; this can be done by 'export TAU_ADIOS2_ENGINE=BPFile' and then running the application with Tau but without Chimbuko.
76-
TAU_EXEC="tau_exec -T papi,mpi,pthread,python,cupti,pdt,adios2 -adios2_trace -cupti -um" #how to execute tau_exec; the -T arguments should mirror the makefile name <------------ ***SET ME***
99+
TAU_EXEC="tau_exec -T papi,mpi,pthread,cupti,pdt,adios2 -cupti -adios2_trace -monitoring" #how to execute tau_exec; the -T arguments should mirror the makefile name <------------ ***SET ME*** #-um
77100
TAU_PYTHON="tau_python -T papi,mpi,pthread,python,cupti,pdt,adios2 -tau-python-interpreter=python3 -adios2_trace -tau-python-args=-u" #how to execute tau_python. Note that passing -u to python forces it to not buffer stdout so we can pipe it
78101
#to tee in realtime <--- SET ME (if !python3)
79102

src/modules/performance_analysis/ad/ADParser.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,10 @@ std::vector<Event_t> ADParser::getEvents() const{
707707
}else if(func_event_type == EXIT && counter_is_correlation_id){
708708
//Correlation IDs are associated only with function ENTRY events; if an EXIT event coincides with a CorrelationID the EXIT takes priority over counter events
709709
//Functions are still greedy with comm events however
710-
SET_PRIO(COMM, FUNC, COUNTER);
710+
//SET_PRIO(COMM, FUNC, COUNTER);
711+
712+
//EDIT 4/10/25: This no longer appears to be the case; at least in my recent testing on CUDA 12.4 and recent TAU, the correlation ID is associated with the function exit. To check, run sstSinker on a GPU program and look at where the kernel-side correlation ID appears relative to the timestamps. Will need to check this on other versions / ROCm.
713+
SET_PRIO(COMM, COUNTER, FUNC);
711714
}else{
712715
//Otherwise funcData takes lowest priority so comm and counter events are included in the function execution
713716
SET_PRIO(COMM, COUNTER, FUNC);

0 commit comments

Comments
 (0)