Skip to content

Commit 92e3ac6

Browse files
committed
ProvDBpruneInterface can now be provided with a prune_type. The associated collection is scanned and events *not* of this type are removed. Added unit test form Normal prune_type
ProvDBtester now cleans up its unqlite files on destruction, allowing multiple instances to be created in a single executable (not at the same time though!)
1 parent f24fe5f commit 92e3ac6

6 files changed

Lines changed: 101 additions & 11 deletions

File tree

include/chimbuko/modules/performance_analysis/provdb/ProvDBpruneInterface.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@ namespace chimbuko {
1515
*/
1616
class ProvDBpruneInterface: public ADDataInterface{
1717
public:
18-
ProvDBpruneInterface(const ADOutlier &ad, sonata::Database &db);
18+
/**
19+
* @brief Constructor
20+
* @param prune_type The class/type of recorded event that is being pruned. Events *not* of this type will be removed from the collection after reevaluation
21+
*/
22+
ProvDBpruneInterface(const ADOutlier &ad, sonata::Database &db, ADDataInterface::EventType prune_type);
1923

2024
/**
2125
* @brief Get the values associated with each recorded anomaly
@@ -39,6 +43,7 @@ namespace chimbuko {
3943
std::unique_ptr<sonata::Collection> m_collection;
4044
std::unordered_map<unsigned long, std::vector<std::pair<uint64_t, double> > > m_data; //[fid] -> [ (record_id, value), ... ]
4145
const ADOutlier &m_ad; //the outlier algorithm to allow access to the model data when updating records
46+
ADDataInterface::EventType m_prune_type; //which type to prune
4247
};
4348

4449
}

src/modules/performance_analysis/provdb/ProvDBprune.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ using namespace chimbuko::modules::performance_analysis;
77

88
void ProvDBprune::pruneImplementation(ADOutlier &ad, sonata::Database &db){
99
//Prune the outliers and update scores / model on remaining
10-
ProvDBpruneInterface pi(ad, db);
10+
ProvDBpruneInterface pi(ad, db, ADDataInterface::EventType::Outlier);
1111
ad.run(pi);
1212

1313
//Prune normal execs and update scores / model on remaining

src/modules/performance_analysis/provdb/ProvDBpruneInterface.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
using namespace chimbuko;
77
using namespace chimbuko::modules::performance_analysis;
88

9-
ProvDBpruneInterface::ProvDBpruneInterface(const ADOutlier &ad, sonata::Database &db): m_database(db), m_ad(ad), ADDataInterface(){
10-
m_collection.reset(new sonata::Collection(db.open("anomalies")));
9+
ProvDBpruneInterface::ProvDBpruneInterface(const ADOutlier &ad, sonata::Database &db, ADDataInterface::EventType prune_type): m_database(db), m_ad(ad), m_prune_type(prune_type), ADDataInterface(){
10+
std::string coll = m_prune_type == ADDataInterface::EventType::Outlier ? "anomalies" : "normalexecs";
11+
m_collection.reset(new sonata::Collection(db.open(coll)));
1112
//To avoid loading all items into memory we must loop over the database using its jx9 interface
1213
std::string script = R"(
1314
$ret = [];
14-
while( ($rec = db_fetch('anomalies')) != NULL ){
15+
while( ($rec = db_fetch(')" + coll + R"(')) != NULL ){
1516
$r = [ $rec.__id, $rec.fid, $rec.runtime_exclusive ];
1617
array_push($ret, $r);
1718
}
@@ -52,8 +53,10 @@ void ProvDBpruneInterface::recordDataSetLabelsInternal(const std::vector<Elem> &
5253
std::vector<uint64_t> to_update;
5354
std::vector<double> update_scores;
5455

56+
ADDataInterface::EventType type_to_remove = m_prune_type == ADDataInterface::EventType::Outlier ? ADDataInterface::EventType::Normal : ADDataInterface::EventType::Outlier;
57+
5558
for(auto const &e: data){ //we used the record id as the index, which is unique
56-
if(e.label == ADDataInterface::EventType::Normal){
59+
if(e.label == type_to_remove){
5760
verboseStream << "Pruning record " << e.index << " with value " << e.value << " and score " << e.score << std::endl;
5861
to_prune.push_back(e.index);
5962
}else{

test/unit_tests/modules/performance_analysis/provdb/ProvDBpruneInterface.cpp

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
using namespace chimbuko;
1010
using namespace chimbuko::modules::performance_analysis;
1111

12-
TEST(TestProvDBpruneOutlierInterface, basic){
12+
TEST(TestProvDBpruneOutlierInterface, pruneNormalFromAnomalies){
1313
int nshards = 1;
1414
ProvDBmoduleSetup setup;
1515
ProvDBtester pdb(nshards, setup);
@@ -52,17 +52,94 @@ TEST(TestProvDBpruneOutlierInterface, basic){
5252
std::unique_ptr<ADOutlier> ad(ADOutlier::set_algorithm(0,"sstd",ap));
5353
ad->setGlobalParameters(param.serialize()); //input model
5454
ad->setGlobalModelSyncFrequency(0); //fix model
55-
ProvDBpruneInterface pi(*ad, pdb.getShard(0));
55+
ProvDBpruneInterface pi(*ad, pdb.getShard(0), ADDataInterface::EventType::Outlier);
5656
ad->run(pi);
5757
}
5858

5959
auto all_data = main_client.retrieveAllData("anomalies");
6060
EXPECT_EQ(all_data.size(), 2);
6161
for(auto const &e : all_data){
6262
nlohmann::json je = nlohmann::json::parse(e);
63-
EXPECT_GE(je["runtime_exclusive"].template get<double>(), 1000);
6463
std::cout << je.dump(4) << std::endl;
64+
double runtime = je["runtime_exclusive"].template get<double>();
65+
EXPECT_GE(runtime, 1000);
66+
67+
//Check scores
68+
double expect_score = (runtime - mean)/stddev; //sstd
69+
EXPECT_NEAR(je["outlier_score"].template get<double>(), expect_score, 1e-5);
70+
71+
//Check model
72+
EXPECT_NEAR(je["algo_params"]["mean"].template get<double>(), mean, 1e-8);
73+
EXPECT_NEAR(je["algo_params"]["stddev"].template get<double>(), stddev, 1e-8);
74+
}
75+
76+
}
77+
78+
TEST(TestProvDBpruneOutlierInterface, pruneAnomaliesFromNormal){
79+
int nshards = 1;
80+
ProvDBmoduleSetup setup;
81+
ProvDBtester pdb(nshards, setup);
82+
83+
ADProvenanceDBclient main_client(setup.getMainDBcollections(), 0);
84+
main_client.connectSingleServer(pdb.getAddr(),nshards);
85+
86+
nlohmann::json anom;
87+
anom["runtime_exclusive"] = 100;
88+
anom["fid"] = 1234;
89+
anom["blah"] = "norm";
90+
91+
main_client.sendData(anom, "normalexecs");
92+
93+
anom["runtime_exclusive"] = 1000;
94+
anom["fid"] = 1234;
95+
anom["blah"] = "real_anom1";
96+
97+
main_client.sendData(anom, "normalexecs");
98+
99+
anom["runtime_exclusive"] = 1200;
100+
anom["fid"] = 1234;
101+
anom["blah"] = "real_anom2";
102+
103+
main_client.sendData(anom, "normalexecs");
104+
105+
double mean = 100;
106+
double stddev = 10;
107+
int count = 1000;
108+
109+
SstdParam param;
110+
param[1234].set_eta(mean);
111+
param[1234].set_rho(pow(stddev,2) * (count-1) );
112+
param[1234].set_count(count);
113+
114+
EXPECT_EQ( main_client.retrieveAllData("normalexecs").size(), 3 );
115+
116+
ADOutlier::AlgoParams ap; ap.sstd_sigma = 5;
117+
{
118+
std::unique_ptr<ADOutlier> ad(ADOutlier::set_algorithm(0,"sstd",ap));
119+
ad->setGlobalParameters(param.serialize()); //input model
120+
ad->setGlobalModelSyncFrequency(0); //fix model
121+
ProvDBpruneInterface pi(*ad, pdb.getShard(0), ADDataInterface::EventType::Normal);
122+
ad->run(pi);
123+
}
124+
125+
auto all_data = main_client.retrieveAllData("normalexecs");
126+
EXPECT_EQ(all_data.size(), 1);
127+
for(auto const &e : all_data){
128+
nlohmann::json je = nlohmann::json::parse(e);
129+
std::cout << je.dump(4) << std::endl;
130+
double runtime = je["runtime_exclusive"].template get<double>();
131+
EXPECT_EQ(runtime, 100);
132+
133+
//Check scores
134+
double expect_score = (runtime - mean)/stddev; //sstd
135+
EXPECT_NEAR(je["outlier_score"].template get<double>(), expect_score, 1e-5);
136+
137+
//Check model
138+
EXPECT_NEAR(je["algo_params"]["mean"].template get<double>(), mean, 1e-8);
139+
EXPECT_NEAR(je["algo_params"]["stddev"].template get<double>(), stddev, 1e-8);
65140
}
66141

67142
}
68143

144+
145+

test/unit_tests/modules/performance_analysis/provdb/ProvDBtester.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include<chimbuko/core/util/string.hpp>
44
#include<unistd.h>
55
#include<sys/types.h>
6+
#include<stdio.h>
67

78
using namespace chimbuko;
89

@@ -17,15 +18,17 @@ ProvDBtester::ProvDBtester(int nshards, const ProvDBmoduleSetupCore &setup){
1718
pid_t pid = getpid();
1819

1920
std::string glob_db_name = "provdb.global";
20-
std::string glob_db_config = stringize("{ \"path\" : \"/tmp/%s.%lu.unqlite\" }", glob_db_name.c_str(),pid);
21+
std::string fn = stringize("/tmp/%s.%lu.unqlite", glob_db_name.c_str(),pid); filenames.push_back(fn);
22+
std::string glob_db_config = stringize("{ \"path\" : \"%s\" }", fn.c_str());
2123
admin->createDatabase(addr, 0, glob_db_name, "unqlite", glob_db_config);
2224

2325
db_shard_names.resize(nshards);
2426
shard_providers.resize(nshards);
2527
for(int s=0;s<nshards;s++){
2628
shard_providers[s] = new sonata::Provider(engine, s+1);
2729
std::string db_name = stringize("provdb.%d",s);
28-
std::string config = stringize("{ \"path\" : \"/tmp/%s.%lu.unqlite\" }", db_name.c_str(),pid);
30+
fn = stringize("/tmp/%s.%lu.unqlite", db_name.c_str(),pid); filenames.push_back(fn);
31+
std::string config = stringize("{ \"path\" : \"%s\" }", fn.c_str());
2932
admin->createDatabase(addr, s+1, db_name, "unqlite", config);
3033
db_shard_names[s] = db_name;
3134
}
@@ -50,4 +53,5 @@ ProvDBtester::~ProvDBtester(){
5053
delete admin;
5154
delete glob_provider;
5255
for(int i=0;i<shard_providers.size();i++) delete shard_providers[i];
56+
for(auto const &fn : filenames) assert(remove(fn.c_str()) == 0);
5357
}

test/unit_tests/modules/performance_analysis/provdb/ProvDBtester.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ namespace chimbuko{
1616
std::vector<std::string> db_shard_names;
1717
std::vector<sonata::Database> db;
1818
sonata::Database glob_db;
19+
std::vector<std::string> filenames;
1920
public:
2021
ProvDBtester(int nshards, const ProvDBmoduleSetupCore &setup);
2122

0 commit comments

Comments
 (0)