Skip to content

Commit 5354a6c

Browse files
committed
Made a significant performance improvement in Histogram merge by avoiding a repeated bin search from the start of the linked list. This is implemented through a new function, extractUniformCountInRangesInt, that takes an ordered array of edges.
Fixed the error-utility unit test, which was failing because of extra stack information. Updated the tau version in the Dockerfile. Added a benchmark of just the histogram merge operation.
1 parent 81a4f42 commit 5354a6c

9 files changed

Lines changed: 351 additions & 20 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Compiler/linker settings shared by the plain and the TAU-instrumented build
BENCH_FLAGS = -std=c++17 -O3 -g -I/src/develop2/PerformanceAnalysis/test/unit_tests -I/install/AD/develop2/include -I/install/AD/develop2/include/chimbuko/3rdparty -L/install/AD/develop2/lib
BENCH_LIBS = -lchimbuko -lstdc++fs

# Plain MPI build of the benchmark
benchmark: benchmark.cpp
	mpic++ $(BENCH_FLAGS) benchmark.cpp -o benchmark $(BENCH_LIBS)

# Build through the TAU compiler wrapper; note that it writes the same output file (./benchmark)
benchmark_tau: benchmark.cpp
	tau_cxx.sh $(BENCH_FLAGS) benchmark.cpp -o benchmark $(BENCH_LIBS)
5+
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
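//A standalone benchmark that times repeated calls to ParamInterface::update using a pre-serialized copy of the parameters (the parameter/histogram merge). No pserver connection is made.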
2+
#include<mpi.h>
3+
#include<chimbuko/ad/ADNetClient.hpp>
4+
#include<chimbuko/ad/utils.hpp>
5+
#include<chimbuko/util/commandLineParser.hpp>
6+
#include<chimbuko/param/sstd_param.hpp>
7+
#include<chimbuko/param/hbos_param.hpp>
8+
#include<chimbuko/param/copod_param.hpp>
9+
#include<chimbuko/ad/ADEvent.hpp>
10+
#include<chimbuko/util/Anomalies.hpp>
11+
#include<chimbuko/ad/ADLocalFuncStatistics.hpp>
12+
#include<chimbuko/ad/ADLocalCounterStatistics.hpp>
13+
#include<chimbuko/ad/ADLocalAnomalyMetrics.hpp>
14+
#include<chimbuko/ad/ADcombinedPSdata.hpp>
15+
#include<chimbuko/ad/ADglobalFunctionIndexMap.hpp>
16+
#include<chimbuko/verbose.hpp>
17+
#include "gtest/gtest.h"
18+
#include<unit_test_common.hpp>
19+
20+
using namespace chimbuko;
21+
22+
/**
 * @brief Command-line options of the benchmark, initialized to their default values
 */
struct Args{
  int nfuncs = 100; /**< Number of function entries created in the parameter object */
  std::string algorithm = "hbos"; /**< AD algorithm: "sstd", "hbos" or "copod" */
  int hbos_bins = 20; /**< Number of histogram bins used for the "hbos" and "copod" algorithms */
  int cycles = 10000; /**< Number of timed update calls */
};
35+
36+
int main(int argc, char **argv){
37+
MPI_Init(&argc, &argv);
38+
39+
if(const char* env_p = std::getenv("CHIMBUKO_VERBOSE")){
40+
std::cout << "Enabling verbose debug output" << std::endl;
41+
enableVerboseLogging() = true;
42+
}
43+
44+
commandLineParser<Args> cmdline;
45+
addOptionalCommandLineArgDefaultHelpString(cmdline, cycles);
46+
addOptionalCommandLineArgDefaultHelpString(cmdline, nfuncs);
47+
addOptionalCommandLineArgDefaultHelpString(cmdline, algorithm); //algorithm, default "sstd"
48+
addOptionalCommandLineArgDefaultHelpString(cmdline, hbos_bins); //default 20
49+
50+
if(argc == 1 || (argc == 2 && std::string(argv[1]) == "-help")){
51+
cmdline.help();
52+
MPI_Finalize();
53+
return 0;
54+
}
55+
56+
Args args;
57+
cmdline.parse(args, argc-1, (const char**)(argv+1));
58+
59+
//Set up a params object with the required number of params
60+
ParamInterface *params;
61+
if(args.algorithm == "sstd"){
62+
SstdParam *p = new SstdParam;
63+
for(int i=0;i<args.nfuncs;i++){
64+
RunStats &r = (*p)[i];
65+
for(int j=0;j<100;j++)
66+
r.push(double(j));
67+
}
68+
params = p;
69+
}else if(args.algorithm == "hbos" || args.algorithm == "copod"){
70+
std::vector<unsigned int> counts(args.hbos_bins);
71+
for(int i=0;i<args.hbos_bins;i++) counts[i] = i;
72+
Histogram hd;
73+
hd.set_histogram(counts, 0.0001, args.hbos_bins, 0, 1);
74+
75+
if(args.algorithm == "hbos"){
76+
HbosParam *p = new HbosParam;
77+
for(int i=0;i<args.nfuncs;i++) (*p)[i].getHistogram() = hd;
78+
params = p;
79+
}else{
80+
CopodParam *p = new CopodParam;
81+
for(int i=0;i<args.nfuncs;i++) (*p)[i].getHistogram() = hd;
82+
params = p;
83+
}
84+
}else{
85+
fatal_error("Unknown AD algorithm");
86+
}
87+
88+
PerfTimer cyc_timer;
89+
PerfTimer timer;
90+
91+
//To make the benchmark as lightweight as possible, precompute the messages and send the same each cycle
92+
std::string params_msg = params->serialize();
93+
RunStats r;
94+
95+
for(int c=0;c<args.cycles;c++){
96+
cyc_timer.start();
97+
std::cout << c << std::endl;
98+
99+
timer.start();
100+
params->update(params_msg, false);
101+
r.push(timer.elapsed_ms());
102+
}//cycle loop
103+
std::cout << r.mean() << " " << r.stddev() << std::endl;
104+
105+
delete params;
106+
MPI_Finalize();
107+
108+
return 0;
109+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/bin/bash
# Runs the benchmark executable on a single MPI rank with the settings below,
# copying its combined stdout/stderr to run.log.

ulimit -c unlimited
set -e
set -o pipefail

# Benchmark settings, passed through as command-line options
exe=./benchmark
cycles=100
nfuncs=100
algorithm="hbos"
hbos_bins=200

# Allow mpirun to be launched as root (e.g. inside a container)
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1

export TAU_VERBOSE=1
# Alternative command that runs the benchmark under callgrind:
#client_cmd="mpirun -n 1 valgrind --tool=callgrind $exe -cycles ${cycles} -nfuncs ${nfuncs} -algorithm ${algorithm} -hbos_bins ${hbos_bins} 2>&1 | tee run.log"
client_cmd="mpirun -n 1 $exe -cycles ${cycles} -nfuncs ${nfuncs} -algorithm ${algorithm} -hbos_bins ${hbos_bins} 2>&1 | tee run.log"
#export CHIMBUKO_VERBOSE=1
#

echo "Instantiating client"
# Quoted so that the command text is printed verbatim, without word-splitting or glob expansion
echo "Command is ${client_cmd}"
# eval is needed because the command string contains a redirection and a pipe
eval "${client_cmd}"

echo "Done"
date

docker/ubuntu18.04/openmpi4.0.4/Dockerfile.tau2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ WORKDIR /Downloads
3333

3434
#Download and install tau from git
3535
#Change the tag below to force docker build not to use cache
36-
RUN echo "FORCE REDOWNLOAD 12_15_22_10_38" > /dev/null && git clone https://github.com/UO-OACISS/tau2.git
36+
RUN echo "FORCE REDOWNLOAD 5_31_23_11_08" > /dev/null && git clone https://github.com/UO-OACISS/tau2.git
3737
WORKDIR /Downloads/tau2
3838

3939
#Use known working version of tau2
40-
RUN git checkout 39b2d014c6bd67ff62f993cfca30e62081caa64d
40+
RUN git checkout 5a42474f8e8efb0d3382eeec43cd14137088181c
4141

4242
#pthread+mpi
4343
RUN ./configure -cc=mpicc -c++=mpicxx -fortran=gfortran -mpi \

docker/ubuntu18.04/openmpi4.0.4/docker_rebuild_stack.sh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22

33
set -e
44

5-
docker build -f Dockerfile.base -t chimbuko/base:ubuntu18.04 .
6-
docker build -f Dockerfile.adios2 -t chimbuko/adios2:ubuntu18.04 .
7-
docker build -f Dockerfile.mochi -t chimbuko/mochi:ubuntu18.04 .
8-
docker build -f Dockerfile.tau2 -t chimbuko/tau2:ubuntu18.04 .
9-
docker build -f Dockerfile.ad.provdb -t chimbuko/ad:ubuntu18.04-provdb .
5+
#docker build -f Dockerfile.base -t chimbuko/base:ubuntu18.04 .
6+
#docker build -f Dockerfile.adios2 -t chimbuko/adios2:ubuntu18.04 .
7+
#docker build -f Dockerfile.mochi -t chimbuko/mochi:ubuntu18.04 .
8+
#docker build -f Dockerfile.tau2 -t chimbuko/tau2:ubuntu18.04 .
9+
#docker build -f Dockerfile.ad.provdb -t chimbuko/ad:ubuntu18.04-provdb .
1010
docker build -f Dockerfile.viz -t chimbuko/viz:ubuntu18.04 .
11-
docker build -f Dockerfile.nwchem -t chimbuko/nwchem:ubuntu18.04-provdb .
12-
docker build -f Dockerfile.chimbuko.nwchem -t chimbuko/run_nwchem:ubuntu18.04-provdb .
13-
docker build -f Dockerfile.chimbuko.benchmark_suite -t chimbuko/run_examples:ubuntu18.04-provdb .
11+
#docker build -f Dockerfile.nwchem -t chimbuko/nwchem:ubuntu18.04-provdb .
12+
#docker build -f Dockerfile.chimbuko.nwchem -t chimbuko/run_nwchem:ubuntu18.04-provdb .
13+
#docker build -f Dockerfile.chimbuko.benchmark_suite -t chimbuko/run_examples:ubuntu18.04-provdb .
1414

1515
#docker build -f Dockerfile.ad.provdb.gcov -t chimbuko/ad:ubuntu18.04-provdb-coverage .

include/chimbuko/util/Histogram.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,15 @@ namespace chimbuko{
439439
*/
440440
double extractUniformCountInRangeInt(double l, double u);
441441

442+
/**
 * @brief Obtain the count of values falling within each of a list of ranges, assuming a uniform distribution of points within a bin. The array of ranges must be ordered consecutively.
 * Each count is rounded to the nearest integer. The data within each range are zeroed in the histogram, creating new bin edges as appropriate.
 * This array operation is more efficient than, but otherwise identical to, applying extractUniformCountInRangeInt to each of the ranges successively (in the same order!)
 * @param edges An ordered list of (lower, upper) bounds, i.e. with edges[n].first < edges[n].second for all n, and edges[n+1].first >= edges[n].second
 * @return The count extracted from each range, in the same order as the entries of edges
 */
448+
std::vector<double> extractUniformCountInRangesInt(const std::vector<std::pair<double,double> > &edges);
449+
450+
442451
Bin const *getFirst() const{ return first; }
443452
Bin const *getEnd() const{ return end; }
444453
private:

src/util/Histogram.cpp

Lines changed: 100 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -159,13 +159,19 @@ void Histogram::merge_histograms_uniform_int(Histogram &combined, const Histogra
159159
int nbin_merged = comb_counts.size();
160160
double new_total = 0;
161161

162+
std::vector<std::pair<double,double> > edges(nbin_merged);
162163
for(int b=0;b<nbin_merged;b++){
163-
auto be = combined.binEdges(b);
164-
unsigned int gc = gw.extractUniformCountInRangeInt(be.first,be.second);
165-
unsigned int lc = lw.extractUniformCountInRangeInt(be.first,be.second);
166-
unsigned int val = gc+lc;
167-
verboseStream << "Bin " << b << " range " << be.first << " to " << be.second << ": gc=" << gc << " lc=" << lc << " val=" << val << std::endl;
168-
164+
edges[b] = combined.binEdges(b);
165+
if(b>0) edges[b].first = edges[b-1].second; //eliminate floating point errors
166+
}
167+
168+
std::vector<double> gc = gw.extractUniformCountInRangesInt(edges);
169+
std::vector<double> lc = lw.extractUniformCountInRangesInt(edges);
170+
171+
for(int b=0;b<nbin_merged;b++){
172+
unsigned int gcc = gc[b], lcc = lc[b];
173+
unsigned int val = lcc + gcc;
174+
verboseStream << "Bin " << b << " range " << edges[b].first << " to " << edges[b].second << ": gc=" << gcc << " lc=" << lcc << " val=" << val << std::endl;
169175
comb_counts[b] += val;
170176
new_total += val;
171177
}
@@ -963,3 +969,91 @@ double HistogramVBW::extractUniformCountInRangeInt(double l, double u){
963969
h->c = 0;
964970
return out;
965971
}
972+
973+
974+
std::vector<double> HistogramVBW::extractUniformCountInRangesInt(const std::vector<std::pair<double,double> > &edges){
975+
std::vector<double> out(edges.size(), 0.);
976+
if(edges.size()==0) return out;
977+
978+
if(first == nullptr) fatal_error("Histogram is empty");
979+
980+
if(m_max == m_min){
981+
//Ignore the bin edges, the data set is a delta function
982+
double v = m_max;
983+
Bin* bin = (Bin*)getBin(v);
984+
double count = bin->c;
985+
for(size_t idx = 0; idx < edges.size(); idx++){
986+
auto const &be = edges[idx];
987+
double l = be.first, u = be.second;
988+
if(u<=l) fatal_error(std::string("Invalid range, require u>l but got l=") + std::to_string(l) + " u=" + std::to_string(u));
989+
990+
if(l < v && u>= v){
991+
verboseStream << "extractUniformCountInRangeInt range " << l << ":" << u << " evaluating for max=min=" << v << ": data are in bin with count " << count << std::endl;
992+
bin->c = 0;
993+
out[idx] = count;
994+
break; //all future entries already 0
995+
}
996+
}
997+
}
998+
999+
Bin* last = end->left;
1000+
Bin* prev_upper = first;
1001+
1002+
for(size_t idx = 0; idx < edges.size(); idx++){
1003+
auto const &be = edges[idx];
1004+
double l = be.first, u = be.second;
1005+
if(u<=l) fatal_error(std::string("Invalid range, require u>l but got l=") + std::to_string(l) + " u=" + std::to_string(u));
1006+
if(idx>0 && l< edges[idx-1].second) fatal_error("Expect edges to be ordered");
1007+
1008+
Bin* bl = Bin::getBin(prev_upper, l);
1009+
1010+
if(bl != nullptr){
1011+
verboseStream << "Lower edge " << l << " in bin " << *bl << std::endl;
1012+
bl = Bin::split(bl,l).second;
1013+
if(bl->is_end){
1014+
verboseStream << "Right of split point is end" << std::endl;
1015+
//If the split point matches the upper edge of the last bin, the .second pointer is END and the entry is 0
1016+
//furthermore, all entries with edges >=l will also have 0 entries, so we don't need to continue
1017+
return out;
1018+
}
1019+
}else if(l <= first->l){ //left edge is left of histogram
1020+
bl = first;
1021+
verboseStream << "Lower edge is left of histogram" << std::endl;
1022+
}else if(l > last->u){ //left edge is right of histogram
1023+
return out;
1024+
}else{
1025+
assert(0);
1026+
}
1027+
1028+
last = end->left; //update last in case it changed
1029+
1030+
Bin* bu = Bin::getBin(bl, u);
1031+
if(bu != nullptr){
1032+
bu = Bin::split(bu,u).first;
1033+
}else if(u <= first->l){ //right edge is left of histogram
1034+
continue;
1035+
}else if(u > last->u){ //right edge is right of histogram
1036+
verboseStream << "Upper edge is right of histogram" << std::endl;
1037+
bu = last;
1038+
}else{
1039+
assert(0);
1040+
}
1041+
1042+
last = end->left;
1043+
1044+
verboseStream << "Zeroing bins between " << *bl << " and " << *bu << std::endl;
1045+
Bin* h = bl;
1046+
while(h != bu){
1047+
out[idx] += h->c;
1048+
h->c = 0;
1049+
h = h->right;
1050+
}
1051+
1052+
out[idx] += h->c;
1053+
h->c = 0;
1054+
1055+
prev_upper = bu;
1056+
}
1057+
1058+
return out;
1059+
}

0 commit comments

Comments
 (0)