2 changes: 1 addition & 1 deletion Common/ML/include/ML/OrtInterface.h
@@ -113,7 +113,7 @@ class OrtModel
private:
// ORT variables -> need to be hidden as pImpl
struct OrtVariables;
- OrtVariables* mPImplOrt;
+ std::shared_ptr<OrtVariables> mPImplOrt = nullptr;
Collaborator: Why do you use a shared_ptr and not a unique_ptr?

Collaborator Author: Because I can't assign nullptr in the header file, since OrtVariables is not known there; the compiler throws an error about the invalid size of an incomplete type.

Collaborator: You have to declare the constructor and destructor in the header, but define them in the cxx. I.e. only OrtInterface() in the header, and
OrtInterface::OrtInterface() = default; in the cxx.
Then you can use unique_ptr.

Collaborator Author: Is there any reason why we need a unique_ptr? Especially for the OrtAllocator (which gets created for only one environment but is used in multiple), a shared_ptr would probably be preferable. A unique_ptr would also require a deleter, which would make the code more cumbersome...
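
And a sketch of why the shared_ptr variant compiles against the incomplete type without out-of-line special members: the deleter is type-erased into the control block when make_shared runs in the .cxx, where OrtVariables is complete (again illustrative and abbreviated):

```cpp
// OrtInterface.h
#include <memory>

class OrtModel
{
 private:
  struct OrtVariables;                               // incomplete in the header
  std::shared_ptr<OrtVariables> mPImplOrt = nullptr; // OK: neither sizeof nor
                                                     // delete is required here
};

// OrtInterface.cxx -- OrtVariables is complete at this point:
// struct OrtModel::OrtVariables { ... };
// mPImplOrt = std::make_shared<OrtVariables>();     // deleter captured here
```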


// Input & Output specifications of the loaded network
std::vector<const char*> mInputNamesChar, mOutputNamesChar;
19 changes: 12 additions & 7 deletions Common/ML/src/OrtInterface.cxx
@@ -41,7 +41,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c
// General purpose
void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)
{
- mPImplOrt = new OrtVariables();
+ mPImplOrt = std::make_shared<OrtVariables>();

// Load from options map
if (!optionsMap.contains("model-path")) {
@@ -147,8 +147,8 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
(mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
(mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
// Arena memory shrinkage comes at performance cost
- /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
- // (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
+ // For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
+ (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27

std::string dev_mem_str = "";
if (mDeviceType == "ROCM") {
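
As context for this hunk, a hedged sketch of the two arena knobs it refers to, written against the public ORT C++ API (CUDA shown for illustration; the real wiring for ROCm lives in SetONNXGPUStream, and configureArena, sessionOptions, runOptions and deviceIndex are placeholder names):

```cpp
#include <onnxruntime_cxx_api.h>
#include <string>

void configureArena(Ort::SessionOptions& sessionOptions, Ort::RunOptions& runOptions, int deviceIndex)
{
  // Provider side: arena_extend_strategy = 0 (kNextPowerOfTwo) grows the arena
  // in few large chunks, so inference effectively runs inside one big allocation.
  OrtCUDAProviderOptions cudaOptions{};
  cudaOptions.device_id = deviceIndex;
  cudaOptions.arena_extend_strategy = 0;
  sessionOptions.AppendExecutionProvider_CUDA(cudaOptions);

  // Run side: shrink the arena back after each Run() on this device; this is
  // the entry enabled above, and it trades throughput for a smaller footprint.
  runOptions.AddConfigEntry("memory.enable_memory_arena_shrinkage",
                            ("gpu:" + std::to_string(deviceIndex)).c_str());
}
```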
@@ -308,6 +308,14 @@ void OrtModel::inference(I* input, int64_t input_size, O* output)
(mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor);

(mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding);
+ // mPImplOrt->session->Run(
+ //   mPImplOrt->runOptions,
+ //   mInputNamesChar.data(),
+ //   &inputTensor,
+ //   mInputNamesChar.size(),
+ //   mOutputNamesChar.data(),
+ //   &outputTensor,
+ //   mOutputNamesChar.size());
}

template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*);
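
As context for the hunk above, a sketch contrasting the two Run() variants; the tensor names "input"/"output" are placeholders for the actual mInputNames/mOutputNames entries:

```cpp
#include <onnxruntime_cxx_api.h>

void runBoth(Ort::Session& session, Ort::RunOptions& runOptions,
             Ort::IoBinding& binding, Ort::Value& inputTensor, Ort::Value& outputTensor)
{
  // IoBinding route (kept in the diff): tensors stay in the buffers they were
  // bound to, so repeated inferences reuse the same device memory.
  binding.BindInput("input", inputTensor);
  binding.BindOutput("output", outputTensor);
  session.Run(runOptions, binding);

  // Name-array route (the commented-out alternative): ORT matches tensors to
  // names on every call and may allocate output memory internally.
  const char* inNames[] = {"input"};
  const char* outNames[] = {"output"};
  session.Run(runOptions, inNames, &inputTensor, 1, outNames, &outputTensor, 1);
}
```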
@@ -427,10 +435,7 @@ template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Fl
// Release session
void OrtModel::release(bool profilingEnabled)
{
- // if (profilingEnabled) {
- //   mPImplOrt->session->EndProfiling();
- // }
- LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) << " bytes";
+ mPImplOrt.reset();
}

// private
33 changes: 22 additions & 11 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
// But environment must be valid, so we init the model environment first and use it here afterwards.
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
- // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+ // if (lane == 0) {
+ // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+ // }
// recreateMemoryAllocator = true;
(nnApplications[lane].mModelClass).initSession();
}
@@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
// (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
(nnApplications[lane].mModelReg1).initEnvironment();
- // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
+ // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
(nnApplications[lane].mModelReg1).initSession();
}
if (nnApplications[lane].mModelsUsed[2]) {
@@ -679,8 +681,9 @@
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
}
+ // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
(nnApplications[lane].mModelReg2).initEnvironment();
- // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+ // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
(nnApplications[lane].mModelReg2).initSession();
}
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -706,8 +709,6 @@
if (doGPU) {
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
}
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
}
#endif

@@ -975,6 +976,15 @@
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];

+ // // bool recreateMemoryAllocator = false;
+ // if (lane == 0) {
+ // (nnApplications[lane].mModelClass).initEnvironment();
+ // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
+ // }
+ // // recreateMemoryAllocator = true;
+ // (nnApplications[lane].mModelClass).initSession();
+ // (nnApplications[lane].mModelReg1).initSession();
+
int withMC = (doGPU && propagateMCLabels);

if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1187,12 +1197,13 @@
}
}
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
- // if (GetProcessingSettings().nn.applyNNclusterizer) {
- // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
- // nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
- // nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
- // nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
- // }
+ if (GetProcessingSettings().nn.applyNNclusterizer) {
+   LOG(info) << "(ORT) Environment releasing...";
+   GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+   nnApplication.mModelClass.release(true);
+   nnApplication.mModelReg1.release(true);
+   nnApplication.mModelReg2.release(true);
+ }
if (transferRunning[i]) {
ReleaseEvent(mEvents->stream[i], doGPU);
}
23 changes: 11 additions & 12 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
std::atomic<size_t> memory_inuse{0};
std::atomic<size_t> num_allocations{0};
std::atomic<size_t> num_reserve_allocations{0};
- OrtMemoryInfo* memory_info;
- GPUReconstruction* rec;
+ OrtMemoryInfo* mMemoryInfoInternal;
+ GPUReconstruction* mRecInternal;
};

MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
- rec = r;
- memory_info = info;
+ mRecInternal = r;
+ mMemoryInfoInternal = info;
}

MockedOrtAllocator::~MockedOrtAllocator()
{
- // Ort::GetApi().ReleaseMemoryInfo(memory_info);
+ // Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
(void)0; // Suppress warning for empty destructor
}

void* MockedOrtAllocator::Alloc(size_t size)
{
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
return rec->AllocateVolatileDeviceMemory(size);
LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void* MockedOrtAllocator::Reserve(size_t size)
{
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
return rec->AllocateVolatileDeviceMemory(size);
LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void MockedOrtAllocator::Free(void* p)
{
// LOG(info) << "(ORT) Freeing volatile memory " << p;
rec->ReturnVolatileDeviceMemory();
}

const OrtMemoryInfo* MockedOrtAllocator::Info() const
{
- return memory_info;
+ return mMemoryInfoInternal;
}

size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
}
}

- void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
+ void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
{
mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
if (recreate) {
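
For context, a hedged sketch of the registration step that makes a hand-rolled OrtAllocator visible to sessions; presumably this is what directOrtAllocator does internally via the C API, combined with the session.use_env_allocators entry set in OrtInterface.cxx (details of the real implementation may differ):

```cpp
#include <onnxruntime_cxx_api.h>

void registerWithEnv(Ort::Env& env, OrtAllocator* mockedAllocator)
{
  // The allocator's Info() must describe the same device/allocator type that
  // sessions will request; otherwise ORT falls back to its own allocator.
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator(env, mockedAllocator));
}
```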
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);

// ONNX
- void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
+ void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
MockedOrtAllocator* getMockedAllocator();
const OrtMemoryInfo* getMockedMemoryInfo();
