Skip to content

Commit 0b03c48

Browse files
authored
Merge pull request #27 from CODARcode/sm_release
Fix for HBOS and COPOD: Handling cases when the number of function runtimes in each IO step is Zero (XGC run)
2 parents ccc9226 + 91eb0de commit 0b03c48

8 files changed

Lines changed: 256 additions & 73 deletions

File tree

include/chimbuko/ad/ADOutlier.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ namespace chimbuko {
238238

239239
private:
240240
double m_alpha; /**< Used to prevent log2 overflow */
241-
double m_threshold; /**< Threshold used to filter anomalies in HBOS*/
241+
double m_threshold; /**< Threshold used to filter anomalies in COPOD*/
242242
bool m_use_global_threshold; /**< Flag to use global threshold*/
243243
//double m_threshold; /** sync with global threshold */
244244
OutlierStatistic m_statistic; /** Which statistic to use for outlier detection */
@@ -253,13 +253,13 @@ namespace chimbuko {
253253
public:
254254

255255
/**
256-
* @brief Construct a new ADOutlierHBOS object
256+
* @brief Construct a new ADOutlierCOPOD object
257257
*
258258
*/
259259
ADOutlierCOPOD(OutlierStatistic stat = ExclusiveRuntime, double threshold = 0.99, bool use_global_threshold = true);
260260

261261
/**
262-
* @brief Destroy the ADOutlierHBOS object
262+
* @brief Destroy the ADOutlierCOPOD object
263263
*
264264
*/
265265
~ADOutlierCOPOD();
@@ -308,11 +308,12 @@ namespace chimbuko {
308308

309309
private:
310310
double m_alpha; /**< Used to prevent log2 overflow */
311-
double m_threshold; /**< Threshold used to filter anomalies in HBOS*/
311+
double m_threshold; /**< Threshold used to filter anomalies in COPOD*/
312312
bool m_use_global_threshold; /**< Flag to use global threshold*/
313-
//double m_threshold; /** sync with global threshold */
314-
OutlierStatistic m_statistic; /** Which statistic to use for outlier detection */
313+
//double m_threshold; /**< sync with global threshold */
314+
OutlierStatistic m_statistic; /**< Which statistic to use for outlier detection */
315315

316+
std::unordered_map<unsigned long, int> m_skewness; /**< skewness for each function*/
316317
};
317318

318319

scripts/launch/run_services.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,18 @@ var_dir=${base}/chimbuko/vars
6161
bp_dir=${base}/chimbuko/adios2
6262

6363
#Check tau adios2 path is writable; by default this is ${bp_dir} but it can be overridden by users, eg for offline analysis
64-
touch ${TAU_ADIOS2_PATH}/write_check
65-
if [[ $? == 1 ]]; then
66-
echo "Chimbuko Services: Could not write to ADIOS2 output path ${TAU_ADIOS2_PATH}, check permissions"
67-
exit 1
68-
fi
69-
rm -f ${TAU_ADIOS2_PATH}/write_check
64+
#touch ${TAU_ADIOS2_PATH}/write_check
65+
#if [[ $? == 1 ]]; then
66+
# echo "Chimbuko Services: Could not write to ADIOS2 output path ${TAU_ADIOS2_PATH}, check permissions"
67+
# exit 1
68+
#fi
69+
#rm -f ${TAU_ADIOS2_PATH}/write_check
7070

7171
#Get head node IP
7272
if command -v ip &> /dev/null
7373
then
7474
ip=$(ip -4 addr show ${service_node_iface} | grep -oP '(?<=inet\s)\d+(\.\d+){3}')
75-
elif command -v iconfig &> /dev/null
75+
elif command -v ifconfig &> /dev/null
7676
then
7777
ip=$(ifconfig 2>&1 ${service_node_iface} | grep -E -o 'inet [0-9.]+' | awk '{print $2}')
7878
else

scripts/provdb_analyze.py

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,19 @@ def generateIndex(interface, keys, collection):
8989

9090
return results
9191

92+
#Write the index to disk
93+
def writeIndex(filename, index):
94+
f = open(filename,"w")
95+
json.dump(index, f)
96+
f.close()
97+
98+
#Read the index from disk
99+
def readIndex(filename):
100+
f = open(filename,"r")
101+
index = json.load(f)
102+
f.close()
103+
return index
104+
92105
#Get an event using the index tuple obtained from the provided index
93106
def getEventByID(interface, index, idx_tuple):
94107
collection = index['__collection']
@@ -119,15 +132,53 @@ def getRecordsByKeyValue(interface, index, key, value, sort_key = 'entry'):
119132
lst.sort(key=lambda x: x[sort_key])
120133
return lst
121134

122-
#Generate a string containing a summary of an event: (process, rank, thread, function name, runtime, runtime/avg)
135+
#Generate a string containing a summary of an event: (process, rank, thread, function name, step, exclusive runtime, inclusive runtime, outlier score, outlier severity)
123136
#If the event was a GPU event, the thread index will be replaced by GPU${DEVICE}/${CONTEXT}/${STREAM}
124137
def summarizeEvent(event):
125138
thr_str = "{}".format(event['tid'])
126139
if event['is_gpu_event']:
127140
thr_str = "GPU{}/{}/{}".format(event['gpu_location']['device'],event['gpu_location']['context'],event['gpu_location']['stream'])
128-
return "{} {} {} {} {}s {}".format(event['pid'],event['rid'], thr_str, event['func'], float(event['runtime_exclusive'])/1e6, float(event['runtime_exclusive'])/float(event['func_stats']['mean']) )
129-
130-
141+
return "pid={} rid={} tid={} func=\"{}\" step={} excl={}s tot={}s score={} severity={}".format(event['pid'],event['rid'], thr_str, event['func'], event['io_step'], float(event['runtime_exclusive'])/1e6, float(event['runtime_total'])/1e6, event['outlier_score'], event['outlier_severity'])
142+
143+
144+
#Get the function profile information for application index 'app'
145+
#Returns a dictionary indexed by the function name with entries
146+
# 'excl_time_tot' : the total exclusive time (in seconds) spent in the function over all ranks and threads
147+
# 'count' : the number of times the function was encountered over all ranks and threads
148+
# 'excl_time_mean' : the average exclusive time (in seconds) spent in the function over all occurrences
149+
# 'frac_time' : the amount of time spent in the function over all occurrences relative to the total runtime spent in all threads and ranks
150+
def getFunctionProfile(interface, app):
151+
profile = interface.getGlobalDB().filter('func_stats', "function($f){return $f[\"app\"] == %d;}" % app)
152+
total_time = 0 #sum of exclusive times is total time
153+
fprofile = {}
154+
for f in profile:
155+
#Don't currently sum times but can reconstruct from count and mean
156+
j = json.loads(f)
157+
count = int(j['exclusive']['count'])
158+
excl_mean = float(j['exclusive']['mean']) / 1e6 #seconds
159+
excl_tot = count * excl_mean
160+
fname = j['name']
161+
fprofile[fname] = {}
162+
fprofile[fname]['excl_time_tot'] = excl_tot
163+
fprofile[fname]['count'] = count
164+
fprofile[fname]['excl_time_mean'] = excl_mean
165+
total_time += excl_tot
166+
167+
168+
for f in fprofile.keys():
169+
fprofile[f]['frac_time'] = fprofile[f]['excl_time_tot']/total_time
170+
171+
return fprofile
172+
173+
#Print summary information for each function sorted by the total exclusive runtime spent in the function
174+
def summarizeProfile(fprofile):
175+
fordered = sorted(fprofile.keys(), key=lambda x: fprofile[x]['excl_time_tot'], reverse=True)
176+
for f in fordered:
177+
finfo = fprofile[f]
178+
print("(name='%s') (total excl. time=%es) (count=%d) (avg. excl. time=%es) (runtime fraction=%e)" % (f,finfo['excl_time_tot'],finfo['count'],finfo['excl_time_mean'],finfo['frac_time']) )
179+
180+
181+
131182
if __name__ == '__main__':
132183
argc = len(sys.argv)
133184
if(argc != 2):

scripts/provdb_interact.py

Lines changed: 76 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
from pysonata.client import SonataClient
1313
from pysonata.admin import SonataAdmin
1414

15-
class provDBshard:
16-
def __openCollection(self, coll_name, create=False):
15+
#Base class for a Sonata client connection to a database
16+
class provDBclientBase:
17+
def _openCollection(self, coll_name, create=False):
1718
if(self.database.exists(coll_name) == False):
1819
if(create == True):
1920
print("Shard %s creating '%s' collection" % (self.db_name, coll_name) )
@@ -23,16 +24,28 @@ def __openCollection(self, coll_name, create=False):
2324
sys.exit(1)
2425
return self.database.open(coll_name)
2526

27+
#Apply a jx9 filter to a collection
28+
def filter(self, which_coll, query):
29+
return self.getCollection(which_coll).filter(query)
30+
31+
def execute(self, code, variables):
32+
return self.database.execute(code,variables)
2633

34+
def store(self, which_coll, record):
35+
return self.getCollection(which_coll).store(record, commit=True)
36+
37+
38+
#Sonata client connection to a database shard
39+
class provDBshard(provDBclientBase):
2740
def __init__(self,client,address,db_name,create=False):
2841
#Open database as a client
2942
self.db_name = db_name
3043
self.database = client.open(address, 0, db_name)
3144

3245
#Initialize collections
33-
self.anomalies = self.__openCollection('anomalies',create=create)
34-
self.normalexecs = self.__openCollection('normalexecs',create=create)
35-
self.metadata = self.__openCollection('metadata',create=create)
46+
self.anomalies = self._openCollection('anomalies',create=create)
47+
self.normalexecs = self._openCollection('normalexecs',create=create)
48+
self.metadata = self._openCollection('metadata',create=create)
3649

3750
def __del__(self):
3851
del self.anomalies
@@ -53,16 +66,34 @@ def getCollection(self, which_coll):
5366
sys.exit(1)
5467
return col
5568

56-
#Apply a jx9 filter to a collection
57-
def filter(self, which_coll, query):
58-
return self.getCollection(which_coll).filter(query)
5969

60-
def execute(self, code, variables):
61-
return self.database.execute(code,variables)
6270

63-
def store(self, which_coll, record):
64-
return self.getCollection(which_coll).store(record, commit=True)
65-
71+
class provDBglobal(provDBclientBase):
72+
def __init__(self,client,address,db_name,create=False):
73+
#Open database as a client
74+
self.db_name = db_name
75+
self.database = client.open(address, 0, db_name)
76+
77+
#Initialize collections
78+
self.func_stats = self._openCollection('func_stats',create=create)
79+
self.counter_stats = self._openCollection('counter_stats',create=create)
80+
81+
def __del__(self):
82+
del self.func_stats
83+
del self.counter_stats
84+
85+
def getCollection(self, which_coll):
86+
col = None
87+
if which_coll == "func_stats":
88+
col = self.func_stats
89+
elif which_coll == "counter_stats":
90+
col = self.counter_stats
91+
else:
92+
print("Invalid collection")
93+
sys.exit(1)
94+
return col
95+
96+
6697

6798

6899
class provDBinterface:
@@ -99,6 +130,27 @@ def __init__(self,engine,filename,nshards,create=False):
99130

100131
self.db_shards.append( provDBshard(self.client, self.address, db_name, create=create) )
101132

133+
#Connect to global database if available
134+
db_name = "provdb.global"
135+
self.db_global_name = db_name
136+
db_file = re.sub(r'\%d',"global",filename)
137+
if os.path.exists(db_file) == False:
138+
if create == True:
139+
print("Creating global database as %s from file %s" % (db_name,db_file))
140+
self.admin.create_database(self.address, 0, db_name, 'unqlite', "{ \"path\" : \"%s\" }" % db_file)
141+
self.use_global_db = True
142+
else:
143+
self.use_global_db = False
144+
else:
145+
print("Attaching global database as %s from file %s" % (db_name,db_file))
146+
self.admin.attach_database(self.address, 0, db_name, 'unqlite', "{ \"path\" : \"%s\" }" % db_file)
147+
self.use_global_db = True
148+
149+
if self.use_global_db:
150+
self.db_global = provDBglobal(self.client, self.address, db_name, create=create)
151+
else:
152+
self.db_global = None
153+
102154
def getNshards(self):
103155
return len(self.db_shards)
104156

@@ -108,11 +160,20 @@ def getShard(self, i):
108160
def getShards(self):
109161
return self.db_shards
110162

163+
#Get the global database client. Returns None if the global database doesn't exist
164+
def getGlobalDB(self):
165+
return self.db_global
166+
111167
def __del__(self):
112-
del self.db_shards
113-
del self.client
168+
del self.db_shards
114169
for n in self.db_names:
115170
self.admin.detach_database(self.address, 0, n)
171+
172+
if self.use_global_db:
173+
del self.db_global
174+
self.admin.detach_database(self.address, 0, self.db_global_name)
175+
176+
del self.client
116177
del self.admin
117178
del self.provider
118179

0 commit comments

Comments
 (0)