diff --git a/.gitignore b/.gitignore index e3dc9796a..190ca29d3 100644 --- a/.gitignore +++ b/.gitignore @@ -464,5 +464,4 @@ FodyWeavers.xsd *.sln.iml # SPTAG benchmark generated artifacts -/perftest_* -/evaluation/2026-04-23/output_distributed_hostname_*.json +*perftest_* diff --git a/.vscode/launch.json b/.vscode/launch.json index 0cb6b4ec2..c8b6c8490 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -101,7 +101,7 @@ }, { "name": "LD_PRELOAD", - "value": "/usr/lib/gcc/x86_64-linux-gnu/11/libasan.so" + "value": "/usr/lib/gcc/x86_64-linux-gnu/13/libasan.so" }, { "name": "PCI_ALLOWED", @@ -170,7 +170,7 @@ }, { "name": "LD_PRELOAD", - "value": "/usr/lib/gcc/x86_64-linux-gnu/11/libasan.so" + "value": "/usr/lib/gcc/x86_64-linux-gnu/13/libasan.so" }, { "name": "PCI_ALLOWED", diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index ea848af74..fe3d306a1 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -53,73 +53,6 @@ extern "C" bool RocksDbIOUringEnable() { return true; } namespace SPTAG::SPANN { - // Simple sharded LRU cache for posting vector counts. - // Thread-safe: each shard has its own mutex. - class PostingCountCache { - public: - PostingCountCache(size_t capacity = 100000, int shards = 16) - : m_shards(shards), m_capacity(std::max(capacity / shards, (size_t)1)) { - m_data.resize(shards); - m_mutexes = std::make_unique(shards); - } - - // Returns (count, true) on hit, (0, false) on miss. 
- std::pair Get(SizeType headID) { - int s = Shard(headID); - std::lock_guard lock(m_mutexes[s]); - auto& shard = m_data[s]; - auto it = shard.map.find(headID); - if (it == shard.map.end()) return {0, false}; - // Move to front (most recently used) - shard.order.splice(shard.order.begin(), shard.order, it->second); - return {it->second->second, true}; - } - - void Put(SizeType headID, int count) { - int s = Shard(headID); - std::lock_guard lock(m_mutexes[s]); - auto& shard = m_data[s]; - auto it = shard.map.find(headID); - if (it != shard.map.end()) { - it->second->second = count; - shard.order.splice(shard.order.begin(), shard.order, it->second); - return; - } - // Evict if full - if (shard.map.size() >= m_capacity) { - auto& back = shard.order.back(); - shard.map.erase(back.first); - shard.order.pop_back(); - } - shard.order.emplace_front(headID, count); - shard.map[headID] = shard.order.begin(); - } - - void Remove(SizeType headID) { - int s = Shard(headID); - std::lock_guard lock(m_mutexes[s]); - auto& shard = m_data[s]; - auto it = shard.map.find(headID); - if (it != shard.map.end()) { - shard.order.erase(it->second); - shard.map.erase(it); - } - } - - private: - int Shard(SizeType headID) const { return static_cast(headID) % m_shards; } - - struct ShardData { - std::list> order; // front = MRU - std::unordered_map>::iterator> map; - }; - - int m_shards; - size_t m_capacity; // per shard - std::vector m_data; - std::unique_ptr m_mutexes; - }; - template class ExtraDynamicSearcher : public IExtraSearcher { @@ -128,18 +61,17 @@ namespace SPTAG::SPANN { private: ExtraDynamicSearcher* m_extraIndex; SizeType m_headID; - bool m_disableReassign; std::function m_callback; public: - MergeAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, bool disableReassign, std::function p_callback) - : m_extraIndex(extraIndex), m_headID(headID), m_disableReassign(disableReassign), m_callback(std::move(p_callback)) {} + MergeAsyncJob(ExtraDynamicSearcher* extraIndex, 
SizeType headID, std::function p_callback) + : m_extraIndex(extraIndex), m_headID(headID), m_callback(std::move(p_callback)) {} ~MergeAsyncJob() {} inline void exec(IAbortOperation* p_abort) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot support job.exec(abort)!\n"); } inline void exec(void* p_workSpace, IAbortOperation* p_abort) override { - ErrorCode ret = m_extraIndex->MergePostings((ExtraWorkSpace*)p_workSpace, m_headID, !m_disableReassign); + ErrorCode ret = m_extraIndex->MergePostings((ExtraWorkSpace*)p_workSpace, m_headID); if (ret != ErrorCode::Success) m_extraIndex->m_asyncStatus = ret; m_extraIndex->m_mergeJobsInFlight--; @@ -155,11 +87,10 @@ namespace SPTAG::SPANN { private: ExtraDynamicSearcher* m_extraIndex; SizeType m_headID; - bool m_disableReassign; std::function m_callback; public: - SplitAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, bool disableReassign, std::function p_callback) - : m_extraIndex(extraIndex), m_headID(headID), m_disableReassign(disableReassign), m_callback(std::move(p_callback)) {} + SplitAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, std::function p_callback) + : m_extraIndex(extraIndex), m_headID(headID), m_callback(std::move(p_callback)) {} ~SplitAsyncJob() {} inline void exec(IAbortOperation* p_abort) { @@ -167,7 +98,7 @@ namespace SPTAG::SPANN { } inline void exec(void* p_workSpace, IAbortOperation* p_abort) override { auto splitStart = std::chrono::high_resolution_clock::now(); - ErrorCode ret = m_extraIndex->Split((ExtraWorkSpace*)p_workSpace, m_headID, !m_disableReassign); + ErrorCode ret = m_extraIndex->Split((ExtraWorkSpace*)p_workSpace, m_headID); auto splitEnd = std::chrono::high_resolution_clock::now(); uint64_t elapsedUs = std::chrono::duration_cast(splitEnd - splitStart).count(); m_extraIndex->m_totalSplitTimeUs += elapsedUs; @@ -183,6 +114,33 @@ namespace SPTAG::SPANN { } }; + class AppendAsyncJob : public Helper::ThreadPool::Job + { + private: + ExtraDynamicSearcher* m_extraIndex; + 
SizeType m_headID; + std::shared_ptr m_vectorInfo; + std::function m_callback; + public: + AppendAsyncJob(ExtraDynamicSearcher* extraIndex, SizeType headID, std::shared_ptr vectorInfo, std::function p_callback) + : m_extraIndex(extraIndex), m_headID(headID), m_vectorInfo(std::move(vectorInfo)), m_callback(std::move(p_callback)) {} + + ~AppendAsyncJob() {} + inline void exec(IAbortOperation* p_abort) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot support job.exec(abort)!\n"); + } + inline void exec(void* p_workSpace, IAbortOperation* p_abort) override { + ErrorCode ret = m_extraIndex->Append((ExtraWorkSpace*)p_workSpace, m_headID, (int)(m_vectorInfo->size() / m_extraIndex->m_vectorInfoSize), *m_vectorInfo); + if (ret != ErrorCode::Success) + m_extraIndex->m_asyncStatus = ret; + m_extraIndex->m_appendJobsInFlight--; + m_extraIndex->m_totalAppendCompleted++; + if (m_callback != nullptr) { + m_callback(); + } + } + }; + class ReassignAsyncJob : public Helper::ThreadPool::Job { private: @@ -249,9 +207,6 @@ namespace SPTAG::SPANN { }; private: - std::shared_ptr> m_freeWorkSpaceIds; - std::atomic m_workspaceCount = 0; - std::shared_ptr db; SPANN::Index* m_headIndex; @@ -279,6 +234,9 @@ namespace SPTAG::SPANN { std::atomic_size_t m_totalMergeSubmitted{ 0 }; std::atomic_size_t m_totalMergeCompleted{ 0 }; + std::atomic_size_t m_appendJobsInFlight{ 0 }; + std::atomic_size_t m_totalAppendSubmitted{ 0 }; + std::atomic_size_t m_totalAppendCompleted{ 0 }; std::atomic_size_t m_totalAppendCount{ 0 }; std::atomic_size_t m_reassignJobsInFlight{ 0 }; @@ -292,22 +250,21 @@ namespace SPTAG::SPANN { size_t m_lastProgressLogQueueSize = std::numeric_limits::max(); size_t m_lastProgressLogSplit = std::numeric_limits::max(); size_t m_lastProgressLogMerge = std::numeric_limits::max(); + size_t m_lastProgressLogAppend = std::numeric_limits::max(); size_t m_lastProgressLogReassign = std::numeric_limits::max(); - // Posting count cache for multi-chunk mode. 
- // Tracks approximate vector count per posting to decide when to split. - std::unique_ptr m_postingCountCache; - bool ShouldLogProgress(size_t totalJobs, bool force = false) { auto now = std::chrono::steady_clock::now(); std::lock_guard lock(m_progressLogMutex); size_t splitJobs = m_splitJobsInFlight.load(); size_t mergeJobs = m_mergeJobsInFlight.load(); + size_t appendJobs = m_appendJobsInFlight.load(); size_t reassignJobs = m_reassignJobsInFlight.load(); bool queueChanged = (totalJobs != m_lastProgressLogQueueSize) || (splitJobs != m_lastProgressLogSplit) || (mergeJobs != m_lastProgressLogMerge) || + (appendJobs != m_lastProgressLogAppend) || (reassignJobs != m_lastProgressLogReassign); if (force) { @@ -315,6 +272,7 @@ namespace SPTAG::SPANN { m_lastProgressLogQueueSize = totalJobs; m_lastProgressLogSplit = splitJobs; m_lastProgressLogMerge = mergeJobs; + m_lastProgressLogAppend = appendJobs; m_lastProgressLogReassign = reassignJobs; return true; } @@ -329,6 +287,7 @@ namespace SPTAG::SPANN { m_lastProgressLogQueueSize = totalJobs; m_lastProgressLogSplit = splitJobs; m_lastProgressLogMerge = mergeJobs; + m_lastProgressLogAppend = appendJobs; m_lastProgressLogReassign = reassignJobs; } return shouldLog; @@ -355,6 +314,7 @@ namespace SPTAG::SPANN { } // Initialize version map: TiKV-backed or local +#ifdef TIKV if (p_opt.m_storage == Storage::TIKVIO && p_opt.m_distributedVersionMap) { auto tikvMap = std::make_unique(); tikvMap->SetDB(db); @@ -365,7 +325,9 @@ namespace SPTAG::SPANN { m_versionMap = std::move(tikvMap); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Using distributed TiKV VersionMap (layer=%d, chunkSize=%d, cacheTTL=%dms, cacheMax=%d)\n", layer, p_opt.m_versionChunkSize, p_opt.m_versionCacheTTLMs, p_opt.m_versionCacheMaxChunks); - } else { + } else +#endif + { m_versionMap = std::make_unique(); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Using local in-memory VersionMap (layer=%d)\n", layer); } @@ -377,14 +339,6 @@ namespace SPTAG::SPANN { 
SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Posting size limit: %d, search limit: %f, merge threshold: %d\n", m_postingSizeLimit, p_opt.m_latencyLimit, m_mergeThreshold); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[CONFIG] layer=%d DistributedVersionMap=%s SearchCheckVersionMapOnlyLayer0=%s UseMultiChunkPosting=%s PostingPageLimit=%d\n", layer, p_opt.m_distributedVersionMap ? "true" : "false", p_opt.m_searchCheckVersionMapOnlyLayer0 ? "true" : "false", p_opt.m_useMultiChunkPosting ? "true" : "false", p_opt.m_postingPageLimit); - - // Initialize posting count cache for multi-chunk mode - if (p_opt.m_useMultiChunkPosting && p_opt.m_storage == Storage::TIKVIO) { - size_t postingCountCacheCapacity = static_cast(std::max(p_opt.m_postingCountCacheCapacity, 1)); - m_postingCountCache = std::make_unique(postingCountCacheCapacity, 16); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "PostingCountCache initialized (capacity=%zu, shards=16) for layer %d\n", - postingCountCacheCapacity, layer); - } } ~ExtraDynamicSearcher() {} @@ -470,9 +424,9 @@ namespace SPTAG::SPANN { SPANN::Index* GetHeadIndex() const { return m_headIndex; } - bool CheckIsNeedReassign(std::vector>& newHeadsVec, ValueType* data, std::shared_ptr splitHeadVec, float_t headToSplitHeadDist, float_t currentHeadDist, bool isInSplitHead) + bool CheckIsNeedReassign(std::vector>& newHeadsVec, const ValueType* data, const ValueType* splitHeadVec, float_t headToSplitHeadDist, float_t currentHeadDist, bool isInSplitHead) { - float_t splitHeadDist = m_headIndex->ComputeDistance(data, splitHeadVec->data()); + float_t splitHeadDist = m_headIndex->ComputeDistance(data, splitHeadVec); if (isInSplitHead) { if (splitHeadDist >= currentHeadDist) return false; @@ -545,7 +499,7 @@ namespace SPTAG::SPANN { // ForceCompaction std::string postingList; - if ((ret = GetPostingFromDB(globalID, &postingList, MaxTimeout, &(workSpace.m_diskRequests))) != + if ((ret = db->Get(DBKey(globalID), &postingList, MaxTimeout, &(workSpace.m_diskRequests))) 
!= ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, @@ -591,7 +545,7 @@ namespace SPTAG::SPANN { if (vectorCount <= m_mergeThreshold) mergelist.insert(globalID); postingList.resize(vectorCount * m_vectorInfoSize); - if ((ret = PutPostingToDB(globalID, postingList, MaxTimeout, + if ((ret = db->Put(DBKey(globalID), postingList, MaxTimeout, &(workSpace.m_diskRequests))) != ErrorCode::Success) { @@ -601,6 +555,7 @@ namespace SPTAG::SPANN { finalcode = ret; return; } + CheckCentroid(globalID, postingList, "RefineIndex"); } else { @@ -630,12 +585,31 @@ namespace SPTAG::SPANN { return ErrorCode::Success; } - ErrorCode Split(ExtraWorkSpace* p_exWorkSpace, const SizeType headID, bool reassign = false, bool requirelock = true) + void CheckCentroid(SizeType pid, std::string& posting, std::string where) + { + SizeType postVectorNum = posting.size() / m_vectorInfoSize; + uint8_t* vectorId = reinterpret_cast(posting.data()); + bool hasHead = false; + for (int j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) + { + SizeType VID = *((SizeType*)(vectorId)); + if (VID == pid) { + hasHead = true; + break; + } + } + if (!hasHead) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "CheckCentroid cannot find head in posting! 
pid:%d, where:%s\n", pid, where.c_str()); + exit(-1); + } + } + + ErrorCode Split(ExtraWorkSpace* p_exWorkSpace, const SizeType headID, bool requirelock = true) { auto splitBegin = std::chrono::high_resolution_clock::now(); std::vector newHeadsID(2, -1); - std::vector> newHeadsVec(2); - std::vector newPostingLists; + std::vector> newHeadsVec(2, nullptr); + std::vector newPostingLists(2, ""); std::shared_ptr headVec; ErrorCode ret; bool theSameHead = false; @@ -662,7 +636,7 @@ namespace SPTAG::SPANN { std::string postingList; auto splitGetBegin = std::chrono::high_resolution_clock::now(); { - if ((ret=GetPostingFromDB(headID, &postingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != + if ((ret=db->Get(DBKey(headID), &postingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, @@ -675,7 +649,7 @@ namespace SPTAG::SPANN { elapsedMSeconds = std::chrono::duration_cast(splitGetEnd - splitGetBegin).count(); m_stat.m_getCost += elapsedMSeconds; // reinterpret postingList to vectors and IDs - auto* postingP = reinterpret_cast(postingList.data()); + uint8_t* postingP = reinterpret_cast(postingList.data()); SizeType postVectorNum = (SizeType)(postingList.size() / m_vectorInfoSize); splitPostingVectors = static_cast(postVectorNum); @@ -685,7 +659,6 @@ namespace SPTAG::SPANN { std::vector localIndices; localIndices.reserve(postVectorNum); uint8_t* vectorId = postingP; - SizeType headj = -1; bool hasHead = false; for (SizeType j = 0; j < postVectorNum; j++, vectorId += m_vectorInfoSize) { @@ -707,19 +680,19 @@ namespace SPTAG::SPANN { } } - if (VID == headID) { - headj = j; - headVec = std::make_shared((char*)vectorId + m_metaDataSize, m_vectorDataSize); - } - //if (VID >= m_versionMap->Count()) SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "DEBUG: vector ID:%d total size:%d\n", VID, m_versionMap->Count()); + if (VID == headID) headVec = std::make_shared((char*)vectorId, m_vectorInfoSize); + + //if (VID >= 
m_versionMap.Count()) SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "DEBUG: vector ID:%d total size:%d\n", VID, m_versionMap.Count()); if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) continue; if (VID == headID) hasHead = true; localIndices.push_back(j); } - if (headj < 0) { + if (headVec == nullptr) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split fail: cannot find head in posting! headID:%lld\n", (std::int64_t)headID); return ErrorCode::Fail; + } else { + *((uint8_t*)(headVec->data() + sizeof(SizeType))) = m_versionMap->GetVersion(headID); } // double gcEndTime = sw.getElapsedMs(); // m_splitGcCost += gcEndTime; @@ -733,14 +706,15 @@ namespace SPTAG::SPANN { memcpy(ptr, postingList.data() + localIndices[j] * m_vectorInfoSize, m_vectorInfoSize); } if (!hasHead) { - Serialize(ptr, headID, m_versionMap->GetVersion(headID), headVec->data()); - localIndices.push_back(headj); + memcpy(ptr, headVec->data(), m_vectorInfoSize); + localIndices.push_back(0); // just to make sure head is included in posting, the index won't be used } postingList.resize(localIndices.size() * m_vectorInfoSize); - if ((ret=PutPostingToDB(headID, postingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(headID), postingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split Fail to write back posting %lld\n", (std::int64_t)(headID)); return ret; } + CheckCentroid(headID, postingList, "Split-GC"); m_stat.m_garbageNum++; auto GCEnd = std::chrono::high_resolution_clock::now(); elapsedMSeconds = std::chrono::duration_cast(GCEnd - splitBegin).count(); @@ -775,13 +749,14 @@ namespace SPTAG::SPANN { memcpy(ptr, postingList.c_str() + localIndices[j] * m_vectorInfoSize, m_vectorInfoSize); if (*((SizeType*)(ptr)) == headID) hasHead = true; } - if (!hasHead) memcpy(newpostingList.data(), postingList.c_str() + headj * m_vectorInfoSize, m_vectorInfoSize); + if 
(!hasHead) memcpy(newpostingList.data(), headVec->data(), m_vectorInfoSize); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Cluserting Failed (The same vector), Only Keep %d vectors.\n", cut); - if ((ret=PutPostingToDB(headID, newpostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(headID), newpostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split fail to override posting cut to limit for posting %lld\n", (std::int64_t)(headID)); return ret; } + CheckCentroid(headID, newpostingList, "Split-one-cluster"); { std::unique_lock tmplock(m_splitListLock); m_splitList.unsafe_erase(headID); @@ -790,14 +765,13 @@ namespace SPTAG::SPANN { } std::vector ks(2, 0); - if (m_headIndex->ComputeDistance(args.centers, headVec->c_str()) < m_headIndex->ComputeDistance(args.centers + args._D, headVec->c_str())) { + if (m_headIndex->ComputeDistance(args.centers, headVec->c_str() + m_metaDataSize) < m_headIndex->ComputeDistance(args.centers + args._D, headVec->c_str() + m_metaDataSize)) { ks[0] = 1; } else { ks[1] = 1; } SizeType newHeadVID = -1; - int first = 0; - newPostingLists.resize(2); + int first = 0; for (int k : ks) { if (args.counts[k] == 0) continue; first = (k == 0) ? 
0 : args.counts[0]; @@ -806,19 +780,20 @@ namespace SPTAG::SPANN { for (int j = 0; j < args.counts[k]; j++, ptr += m_vectorInfoSize) { memcpy(ptr, postingList.c_str() + localIndices[first + j] * m_vectorInfoSize, m_vectorInfoSize); - //Serialize(ptr, localIndicesInsert[localIndices[first + j]], localIndicesInsertVersion[localIndices[first + j]], smallSample[localIndices[first + j]]); } - if (!theSameHead && headVec && m_headIndex->ComputeDistance(args.centers + k * args._D, headVec->c_str()) < Epsilon) { + if (!theSameHead && m_headIndex->ComputeDistance(args.centers + k * args._D, headVec->c_str() + m_metaDataSize) < Epsilon) { newHeadsID[k] = headID; - newHeadsVec[k] = headVec; + newHeadsVec[k] = std::make_shared(headVec->c_str() + m_metaDataSize, m_vectorDataSize); newHeadVID = headID; theSameHead = true; - if (!hasHead && headj != -1) newPostingLists[k] += postingList.substr(headj * m_vectorInfoSize, m_vectorInfoSize); + if (!hasHead) newPostingLists[k] += *headVec; + auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret=PutPostingToDB(newHeadVID, newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to override posting %lld\n", (std::int64_t)(newHeadVID)); return ret; } + CheckCentroid(newHeadVID, newPostingLists[k], "Split-SameHead"); auto splitPutEnd = std::chrono::high_resolution_clock::now(); elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); m_stat.m_putCost += elapsedMSeconds; @@ -850,19 +825,25 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split: new head VID %lld is being locked after %d retries. 
Skip merging and return split failed...\n", (std::int64_t)(newHeadVID), retry); - return ErrorCode::Fail; + { + std::unique_lock tmplock(m_splitListLock); + m_splitList.unsafe_erase(headID); + } + SplitAsync(headID, postingList.size() / m_vectorInfoSize); + return ErrorCode::Success; } } if (m_headIndex->ContainSample(newHeadVID, m_layer + 1)) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Split: new head VID %lld already exists in head index. Do merging...\n", (std::int64_t)(newHeadVID)); + //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Split: new head VID %lld already exists in head index. Do merging...\n", (std::int64_t)(newHeadVID)); m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); + std::string mergedPostingList; std::set vectorIdSet; std::string currentPostingList; { - if ((ret = GetPostingFromDB(newHeadVID, &currentPostingList, MaxTimeout, + if ((ret = db->Get(DBKey(newHeadVID), &currentPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get posting %lld\n", @@ -897,6 +878,10 @@ namespace SPTAG::SPANN { for (int j = 0; j < newPostVectorNum; j++, postingK += m_vectorInfoSize) { SizeType VID = *((SizeType *)(postingK)); + uint8_t version = *(postingK + sizeof(SizeType)); + + if (m_versionMap->Deleted(VID) || m_versionMap->GetVersion(VID) != version) + continue; if (vectorIdSet.find(VID) != vectorIdSet.end()) continue; @@ -906,25 +891,28 @@ namespace SPTAG::SPANN { currentLength++; } - if (currentLength > (m_postingSizeLimit + m_bufferSizeLimit)) + if (currentLength > (m_postingSizeLimit + m_bufferSizeLimit) && m_opt->m_storage == Storage::FILEIO) { + /* SPTAGLIB_LOG( Helper::LogLevel::LL_Warning, "Split: merged posting list length %d exceeds hard limit %d after merging head " "VID %lld. 
Cut to limit and put back to db.\n", currentLength, m_postingSizeLimit + m_bufferSizeLimit, (std::int64_t)(newHeadVID)); + */ mergedPostingList.resize((m_postingSizeLimit + m_bufferSizeLimit) * m_vectorInfoSize); currentLength = m_postingSizeLimit + m_bufferSizeLimit; } auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret = PutPostingToDB(newHeadVID, mergedPostingList, MaxTimeout, + if ((ret = db->Put(DBKey(newHeadVID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to put posting %lld\n", (std::int64_t)(newHeadVID)); return ret; } + CheckCentroid(newHeadVID, mergedPostingList, "Split-MergePosting"); auto splitPutEnd = std::chrono::high_resolution_clock::now(); elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin) @@ -938,10 +926,11 @@ namespace SPTAG::SPANN { } } else { auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret=PutPostingToDB(newHeadVID, newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); return ret; } + CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); auto splitPutEnd = std::chrono::high_resolution_clock::now(); elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); m_stat.m_putCost += elapsedMSeconds; @@ -959,15 +948,14 @@ namespace SPTAG::SPANN { auto updateHeadEnd = std::chrono::high_resolution_clock::now(); elapsedMSeconds = std::chrono::duration_cast(updateHeadEnd - updateHeadBegin).count(); m_stat.m_updateHeadCost += elapsedMSeconds; - - if (m_opt->m_excludehead) m_versionMap->IncVersion(newHeadVID, &version, version); } + if (m_rwLocks.hash_func(newHeadVID) != 
m_rwLocks.hash_func(headID)) anotherLock.unlock(); } //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head id: %d split into : %d, length: %d\n", headID, newHeadVID, args.counts[k]); } if (!theSameHead) { m_headIndex->DeleteIndex(headID, m_layer + 1); - if ((ret=DeletePostingFromDB(headID)) != ErrorCode::Success) + if ((ret=db->Delete(DBKey(headID))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting in Split\n"); return ret; @@ -996,7 +984,8 @@ namespace SPTAG::SPANN { IndexStats::HistAdd(m_stat.m_splitNewHeadCount, splitNewHeadCount); m_stat.m_splitNewHeadCountTotal.fetch_add(splitNewHeadCount, std::memory_order_relaxed); m_stat.m_splitNewHeadSampleCount.fetch_add(1, std::memory_order_relaxed); - if (reassign && headVec) { + + if (!m_opt->m_disableReassign) { auto reassignScanBegin = std::chrono::high_resolution_clock::now(); CollectReAssign(p_exWorkSpace, headID, headVec, newPostingLists, newHeadsID, newHeadsVec, theSameHead); @@ -1012,18 +1001,18 @@ namespace SPTAG::SPANN { return ErrorCode::Success; } - ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID, bool reassign = false) + ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID) { std::unique_lock lock(m_rwLocks[headID]); if (!m_headIndex->ContainSample(headID, m_layer + 1)) { - std::unique_lock lock(m_mergeListLock); + std::unique_lock tmplock(m_mergeListLock); m_mergeList.unsafe_erase(headID); return ErrorCode::Success; } { - std::shared_lock lock(m_mergeListLock); + std::shared_lock tmplock(m_mergeListLock); if (m_mergeList.find(headID) == m_mergeList.end()) { return ErrorCode::Success; } @@ -1035,7 +1024,7 @@ namespace SPTAG::SPANN { std::string currentPostingList; ErrorCode ret; { - if ((ret = GetPostingFromDB(headID, &currentPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != + if ((ret = db->Get(DBKey(headID), &currentPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG( @@ -1068,6 
+1057,8 @@ namespace SPTAG::SPANN { if (headVec == nullptr) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail: cannot find head vector in posting! headID:%lld\n", (std::int64_t)headID); return ErrorCode::Fail; + } else { + *((uint8_t*)(headVec->data() + sizeof(SizeType))) = m_versionMap->GetVersion(headID); } if (currentLength > m_mergeThreshold) @@ -1075,12 +1066,13 @@ namespace SPTAG::SPANN { if (vectorIdSet.find(headID) == vectorIdSet.end() && headVec != nullptr) { mergedPostingList += *headVec; } - if ((ret=PutPostingToDB(headID, mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(headID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Merge Fail to write back posting %lld\n", (std::int64_t)headID); return ret; } + CheckCentroid(headID, mergedPostingList, "MergePostings-ignore"); { - std::unique_lock lock(m_mergeListLock); + std::unique_lock tmplock(m_mergeListLock); m_mergeList.unsafe_erase(headID); } return ErrorCode::Success; @@ -1102,7 +1094,6 @@ namespace SPTAG::SPANN { int dedupLength = 0; SizeType nextHeadID = -1; - SizeType deletedHeadID = -1; std::shared_ptr nextHeadVec; std::shared_ptr deletedHeadVec; std::string * deletedPostingList = nullptr; @@ -1114,7 +1105,7 @@ namespace SPTAG::SPANN { // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID); if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { if (!anotherLock.try_lock()) { - auto* curJob = new MergeAsyncJob(this, headID, reassign, nullptr); + auto* curJob = new MergeAsyncJob(this, headID, nullptr); // Re-queue counts as a new submission; matched by the // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in // MergeAsyncJob::exec(). 
Without these increments @@ -1127,7 +1118,7 @@ namespace SPTAG::SPANN { } } if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; - if ((ret=GetPostingFromDB(queryResult->VID, &nextPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Get(DBKey(queryResult->VID), &nextPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get to be merged posting: %lld, get size:%d\n", (std::int64_t)(queryResult->VID), (int)(nextPostingList.size())); @@ -1162,18 +1153,18 @@ namespace SPTAG::SPANN { if (vectorIdSet.find(headID) == vectorIdSet.end() && nextVectorIdSet.find(headID) == nextVectorIdSet.end() && headVec != nullptr) { mergedPostingList += *headVec; } - if ((ret=PutPostingToDB(headID, mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(headID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail to override old posting %lld after merge\n", (std::int64_t)headID); return ret; } + CheckCentroid(headID, mergedPostingList, "MergePostings-currentLength >= nextLength"); m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); - if ((ret=DeletePostingFromDB(queryResult->VID)) != ErrorCode::Success) + if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID)); return ret; } nextHeadID = headID; - deletedHeadID = queryResult->VID; nextHeadVec = headVec; deletedHeadVec = resultVec; deletedPostingList = &nextPostingList; @@ -1183,18 +1174,18 @@ namespace SPTAG::SPANN { if (vectorIdSet.find(queryResult->VID) == vectorIdSet.end() && nextVectorIdSet.find(queryResult->VID) == nextVectorIdSet.end() && resultVec != nullptr) { mergedPostingList += 
*resultVec; } - if ((ret=PutPostingToDB(queryResult->VID, mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(queryResult->VID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail to override posting %lld after merge\n", (std::int64_t)(queryResult->VID)); return ret; } + CheckCentroid(queryResult->VID, mergedPostingList, "MergePostings-currentLength < nextLength"); m_headIndex->DeleteIndex(headID, m_layer + 1); - if ((ret = DeletePostingFromDB(headID)) != ErrorCode::Success) + if ((ret = db->Delete(DBKey(headID))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(headID)); return ret; } nextHeadID = queryResult->VID; - deletedHeadID = headID; nextHeadVec = resultVec; deletedHeadVec = headVec; deletedPostingList = &currentPostingList; @@ -1206,7 +1197,7 @@ namespace SPTAG::SPANN { // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Release: %d, Release: %d\n", headID, queryResult->VID); lock.unlock(); - if (reassign) + if (!m_opt->m_disableReassign) { postingP = reinterpret_cast(deletedPostingList->data()); for (int j = 0; j < deletedLength; j++) { @@ -1222,27 +1213,11 @@ namespace SPTAG::SPANN { ReassignAsync(std::make_shared((char*)vectorId, m_vectorInfoSize), nextHeadID); } } - - if (!m_versionMap->Deleted(deletedHeadID)) - { - std::shared_ptr vectorinfo = - std::make_shared(m_vectorInfoSize, ' '); - // deletedHeadVec is the full m_vectorInfoSize record - // ([VID][version][vector]) read from the posting in - // MergePostings (see line ~990). Serialize expects a - // pointer to the raw m_vectorDataSize-byte vector, so - // skip the m_metaDataSize prefix to avoid shifting the - // vector bytes by 5 and corrupting the reassigned data. 
- Serialize(vectorinfo->data(), deletedHeadID, m_versionMap->GetVersion(deletedHeadID), - deletedHeadVec->data() + m_metaDataSize); - m_stat.m_reassignSubmittedFromMerge.fetch_add(1, std::memory_order_relaxed); - ReassignAsync(vectorinfo, -1); - } } { { - std::unique_lock lock(m_mergeListLock); + std::unique_lock tmplock(m_mergeListLock); m_mergeList.unsafe_erase(headID); m_mergeList.unsafe_erase(queryResult->VID); } @@ -1258,12 +1233,13 @@ namespace SPTAG::SPANN { if (vectorIdSet.find(headID) == vectorIdSet.end() && headVec != nullptr) { mergedPostingList += *headVec; } - if ((ret=PutPostingToDB(headID, mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if ((ret=db->Put(DBKey(headID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Merge Fail to write back posting %lld\n", (std::int64_t)headID); return ret; } + CheckCentroid(headID, mergedPostingList, "MergePostings-GC"); { - std::unique_lock lock(m_mergeListLock); + std::unique_lock tmplock(m_mergeListLock); m_mergeList.unsafe_erase(headID); } return ErrorCode::Success; @@ -1289,7 +1265,7 @@ namespace SPTAG::SPANN { } } - auto* curJob = new SplitAsyncJob(this, headID, m_opt->m_disableReassign, p_callback); + auto* curJob = new SplitAsyncJob(this, headID, p_callback); m_splitJobsInFlight++; m_totalSplitSubmitted++; m_splitThreadPool->add(curJob); @@ -1299,7 +1275,7 @@ namespace SPTAG::SPANN { inline void MergeAsync(SizeType headID, std::function p_callback = nullptr) { { - std::shared_lock lock(m_mergeListLock); + std::shared_lock tmplock(m_mergeListLock); auto res = m_mergeList.insert(headID); if (!res.second) { @@ -1308,25 +1284,41 @@ namespace SPTAG::SPANN { } } - auto* curJob = new MergeAsyncJob(this, headID, m_opt->m_disableReassign, p_callback); + auto* curJob = new MergeAsyncJob(this, headID, p_callback); m_mergeJobsInFlight++; m_totalMergeSubmitted++; 
m_splitThreadPool->add(curJob); } - inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, std::function p_callback = nullptr) + inline void AppendAsync(SizeType headID, std::shared_ptr postingList, bool urgent = false,std::function p_callback = nullptr) + { + auto* curJob = new AppendAsyncJob(this, headID, std::move(postingList), p_callback); + m_appendJobsInFlight++; + m_totalAppendSubmitted++; + if (urgent) { + m_splitThreadPool->addfront(curJob); + } else { + m_splitThreadPool->add(curJob); + } + } + + inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, bool urgent = false, std::function p_callback = nullptr) { auto* curJob = new ReassignAsyncJob(this, std::move(vectorInfo), headPrev, p_callback); m_reassignJobsInFlight++; m_totalReassignSubmitted++; - m_splitThreadPool->add(curJob); + if (urgent) { + m_splitThreadPool->addfront(curJob); + } else { + m_splitThreadPool->add(curJob); + } } ErrorCode CollectReAssign(ExtraWorkSpace *p_exWorkSpace, SizeType headID, std::shared_ptr headVec, std::vector &postingLists, std::vector &newHeadsID, std::vector> &newHeadsVec, bool theSameHead) { - auto headVector = reinterpret_cast(headVec->data()); + auto headVector = reinterpret_cast(headVec->data() + m_metaDataSize); // Collect vectors that need reassign, then do RNGSelection inline // and batch Append by target head to reduce TiKV RPCs. 
@@ -1367,20 +1359,10 @@ namespace SPTAG::SPANN { } }; - if (m_opt->m_excludehead && !theSameHead) - { - if (!m_versionMap->Deleted(headID)) - { - std::shared_ptr vectorinfo = std::make_shared(m_vectorInfoSize, ' '); - Serialize(vectorinfo->data(), headID, m_versionMap->GetVersion(headID), headVector); - // excludehead reassign: use the lambda with headPrev=-1 - tryBatchReassign(reinterpret_cast(vectorinfo->data()), -1); - } - } std::vector newHeadsDist(2, 0.0f); std::set reAssignVectorsTopK; - if (newHeadsVec[0]) newHeadsDist[0] = m_headIndex->ComputeDistance(headVec->data(), newHeadsVec[0]->data()); - if (newHeadsVec[1]) newHeadsDist[1] = m_headIndex->ComputeDistance(headVec->data(), newHeadsVec[1]->data()); + if (newHeadsVec[0]) newHeadsDist[0] = m_headIndex->ComputeDistance(headVector, newHeadsVec[0]->data()); + if (newHeadsVec[1]) newHeadsDist[1] = m_headIndex->ComputeDistance(headVector, newHeadsVec[1]->data()); for (int i = 0; i < postingLists.size(); i++) { if (!newHeadsVec[i]) continue; auto& postingList = postingLists[i]; @@ -1401,7 +1383,7 @@ namespace SPTAG::SPANN { if (reAssignVectorsTopK.find(vid) == reAssignVectorsTopK.end() && !m_versionMap->Deleted(vid) && m_versionMap->GetVersion(vid) == version) { m_stat.m_reAssignScanNum++; float dist = m_headIndex->ComputeDistance(newHeadsVec[i]->data(), vector); - if (CheckIsNeedReassign(newHeadsVec, vector, headVec, newHeadsDist[i], dist, true)) { + if (CheckIsNeedReassign(newHeadsVec, vector, headVector, newHeadsDist[i], dist, true)) { tryBatchReassign(vectorId, newHeadsID[i]); reAssignVectorsTopK.insert(vid); } @@ -1435,22 +1417,13 @@ namespace SPTAG::SPANN { } auto reassignScanIOBegin = std::chrono::high_resolution_clock::now(); ErrorCode ret; - bool reassignReadOk = true; - if (IsMultiChunk()) { - auto* tikvDB = this->GetTiKVDB(); - auto dbKeys = DBKeys(HeadPrevTopK); - if ((ret = tikvDB->MultiScanPostings(*dbKeys, p_exWorkSpace->m_pageBuffers, m_hardLatencyLimit)) != ErrorCode::Success) - { - 
SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "ReAssign skipped: couldn't scan nearby postings (non-fatal)\n"); - reassignReadOk = false; - } - } else { + { auto keys = DBKeys(HeadPrevTopK); if ((ret = db->MultiGet(*keys, p_exWorkSpace->m_pageBuffers, m_hardLatencyLimit, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "ReAssign skipped: couldn't read nearby postings (non-fatal)\n"); - reassignReadOk = false; + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "ReAssign can't get all the near postings\n"); + return ret; } } @@ -1458,16 +1431,6 @@ namespace SPTAG::SPANN { auto elapsedMSeconds = std::chrono::duration_cast(reassignScanIOEnd - reassignScanIOBegin).count(); m_stat.m_reassignScanIOCost += elapsedMSeconds; - if (reassignReadOk) { - // IMPORTANT: snapshot each posting buffer into a local std::string - // BEFORE iterating. tryBatchReassign() below calls - // RNGSelection -> SearchHeadIndex -> SearchDiskIndex -> - // searcher->SearchIndex(p_exWorkSpace, ...) which performs its own - // MultiGet/MultiScanPostings into p_exWorkSpace->m_pageBuffers, - // overwriting (or reallocating) the very buffers we are scanning. - // Without this snapshot, the raw `postingP` pointer dangles or is - // mutated mid-loop, leading to records being interpreted as garbage - // (visible as invalid VIDs at the tail of single-chunk postings). 
std::vector nearbyPostings(HeadPrevTopK.size()); for (int i = 0; i < HeadPrevTopK.size(); i++) { @@ -1497,26 +1460,25 @@ namespace SPTAG::SPANN { if (reAssignVectorsTopK.find(vid) == reAssignVectorsTopK.end() && !m_versionMap->Deleted(vid) && m_versionMap->GetVersion(vid) == version) { m_stat.m_reAssignScanNum++; float dist = m_headIndex->ComputeDistance(HeadPrevTopKVec[i]->data(), vector); - if (CheckIsNeedReassign(newHeadsVec, vector, headVec, newHeadsDist[i], dist, false)) { + if (CheckIsNeedReassign(newHeadsVec, vector, headVector, newHeadsDist[i], dist, false)) { tryBatchReassign(vectorId, HeadPrevTopK[i]); reAssignVectorsTopK.insert(vid); } } } } - } // reassignReadOk } - // Batch Append: one Append batch per target head instead of one - // ReassignAsync per vector. For TiKV, fan these target heads out via - // the async append path so this split worker does not wait for each - // RPC serially. - if (!batchReassign.empty()) { - ErrorCode ret = AppendBatchAsync(p_exWorkSpace, batchReassign, "CollectReAssign"); - if (ret != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "CollectReAssign batch append failed for %zu target heads\n", - batchReassign.size()); + + // Batch Append: one Append call per target head instead of one ReassignAsync per vector + // Use reassignThreshold=0 so that if the posting overflows, it goes through + // SplitAsync (async) rather than synchronous Split, avoiding recursive deadlock: + // Split -> CollectReAssign -> Append -> Split -> CollectReAssign -> ... 
+ ErrorCode ret = ErrorCode::Success; + if (m_opt->m_storage == Storage::TIKVIO) ret = BatchAppend(p_exWorkSpace, batchReassign, "CollectReAssign"); + else { + for (auto& kv : batchReassign) { + AppendAsync(kv.first, std::make_shared(kv.second), true); } } if (batchReassignCount > 0) { @@ -1536,7 +1498,7 @@ namespace SPTAG::SPANN { m_stat.m_splitReassignSampleCount.fetch_add(1, std::memory_order_relaxed); m_stat.m_splitReassignRecordSampleCount.fetch_add(1, std::memory_order_relaxed); m_stat.m_splitReassignTargetHeadSampleCount.fetch_add(1, std::memory_order_relaxed); - return ErrorCode::Success; + return ret; } bool RNGSelection(ExtraWorkSpace* p_exWorkSpace, std::vector& selections, ValueType* queryVector, int& replicaCount, SizeType checkHeadID = -1) @@ -1579,29 +1541,6 @@ namespace SPTAG::SPANN { return true; } - void InitWorkSpace(ExtraWorkSpace* p_exWorkSpace, bool clear = false) - { - if (clear) { - p_exWorkSpace->Clear(m_opt->m_searchInternalResultNum, (max(m_opt->m_postingPageLimit, m_opt->m_searchPostingPageLimit) + m_opt->m_bufferLength) << PageSizeEx, true, m_opt->m_enableDataCompression); - } - else { - p_exWorkSpace->Initialize(m_opt->m_maxCheck, m_opt->m_hashExp, max(m_opt->m_searchInternalResultNum, m_opt->m_reassignK), (max(m_opt->m_postingPageLimit, m_opt->m_searchPostingPageLimit) + m_opt->m_bufferLength) << PageSizeEx, true, m_opt->m_enableDataCompression); - int wid = 0; - if (m_freeWorkSpaceIds == nullptr || !m_freeWorkSpaceIds->try_pop(wid)) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "FreeWorkSpaceIds is not initalized or the workspace number is not enough! 
Please increase iothread number.\n"); - p_exWorkSpace->m_diskRequests[0].m_status = -1; - return; - } - for (auto & req : p_exWorkSpace->m_diskRequests) - { - req.m_status = wid; - } - p_exWorkSpace->m_callback = [m_freeWorkSpaceIds = m_freeWorkSpaceIds, wid] () { - if (m_freeWorkSpaceIds) m_freeWorkSpaceIds->push(wid); - }; - } - } ErrorCode Append(ExtraWorkSpace* p_exWorkSpace, SizeType headID, int appendNum, std::string& appendPosting, int reassignThreshold = 0) { @@ -1625,7 +1564,7 @@ namespace SPTAG::SPANN { if (m_versionMap->GetVersion(VID) == version) { // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss To ReAssign: VID: %d, current version: %d\n", *(int*)(&appendPosting[idx]), version); m_stat.m_headMiss++; - ReassignAsync(vectorInfo, headID); + ReassignAsync(vectorInfo, headID, true); } // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss Do Not To ReAssign: VID: %d, version: %d, current version: %d\n", *(int*)(&appendPosting[idx]), m_versionMap->GetVersion(*(int*)(&appendPosting[idx])), version); } @@ -1633,7 +1572,6 @@ namespace SPTAG::SPANN { } double appendIOSeconds = 0; int postingSize = 0; - bool splitPending = false; { //std::shared_lock lock(m_rwLocks[headID]); //ROCKSDB // [DIAG] measure lock wait time (suspect A: lock contention) @@ -1641,144 +1579,62 @@ namespace SPTAG::SPANN { std::unique_lock lock(m_rwLocks[headID]); //SPDK auto _lockAcq = std::chrono::high_resolution_clock::now(); uint64_t _lockWaitUs = std::chrono::duration_cast(_lockAcq - _lockBegin).count(); + IndexStats::HistAdd(m_stat.m_appendLockWaitUs, _lockWaitUs); + m_stat.m_appendLockWaitTotalUs.fetch_add(_lockWaitUs, std::memory_order_relaxed); + ErrorCode ret; if (!m_headIndex->ContainSample(headID, m_layer + 1)) { lock.unlock(); goto checkDeleted; } { - std::shared_lock lock(m_splitListLock); + std::shared_lock tmplock(m_splitListLock); auto it = m_splitList.find(headID); if (it != m_splitList.end()) { postingSize = it->second; - splitPending = true; } } - // For multi-chunk 
mode, also check the posting count cache/TiKV - // since m_splitList only has entries for postings pending split. - if (IsMultiChunk() && postingSize == 0) { - int cnt = GetCachedPostingCount(headID); - if (cnt < 0) { - // [FIX] Count is currently unknown (TiKV error). Aborting the - // RMW is far safer than calling AppendChunkAndUpdateCount with - // oldCount=0, which would PutChunkAndCount(newCount=appendNum) - // and corrupt the existing (larger) count value in TiKV. - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Append: posting count unknown for headID=%lld; aborting RMW to avoid count corruption\n", - (std::int64_t)headID); - m_stat.m_appendGetFail.fetch_add(1, std::memory_order_relaxed); - return ErrorCode::Fail; - } - postingSize = cnt; - } - if (!splitPending && postingSize + appendNum > (m_postingSizeLimit + m_bufferSizeLimit)) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "After appending, the number of vectors in %lld exceeds the postingsize + buffersize (%d + %d)! Do split now...\n", (std::int64_t)headID, m_postingSizeLimit, m_bufferSizeLimit); - if (reassignThreshold == 0) { - // From CollectReAssign batch: schedule async split but proceed - // with the append below (don't retry — async split hasn't - // finished so retrying would spin-loop). - SplitAsync(headID, postingSize + appendNum); - } else { - ret = Split(p_exWorkSpace, headID, !m_opt->m_disableReassign, false); - if (ret != ErrorCode::Success) - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split %lld failed!\n", (std::int64_t)headID); - lock.unlock(); - goto checkDeleted; - } + + if (postingSize + appendNum > (m_postingSizeLimit + m_bufferSizeLimit) && m_opt->m_storage == Storage::FILEIO) { + //SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "After appending, the number of vectors in %lld exceeds the postingsize + buffersize (%d + %d)! 
Do split now...\n", (std::int64_t)headID, m_postingSizeLimit, m_bufferSizeLimit); + ret = Split(p_exWorkSpace, headID, false); + if (ret != ErrorCode::Success) + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split %lld failed!\n", (std::int64_t)headID); + lock.unlock(); + goto checkDeleted; } auto appendIOBegin = std::chrono::high_resolution_clock::now(); - if (IsMultiChunk()) { - { static std::atomic _logOnce{0}; if (_logOnce.fetch_add(1) == 0) SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[PATH] Append using MULTI-CHUNK AppendChunk path\n"); } - // Multi-chunk path: write chunk + update count in one BatchPut RPC. - auto _mcBegin = std::chrono::high_resolution_clock::now(); - ret = AppendChunkAndUpdateCount(headID, appendPosting, appendNum, - postingSize, MaxTimeout, - &(p_exWorkSpace->m_diskRequests)); - auto _mcEnd = std::chrono::high_resolution_clock::now(); - uint64_t _mcUs = std::chrono::duration_cast(_mcEnd - _mcBegin).count(); - IndexStats::HistAdd(m_stat.m_mcAppendUs, _mcUs); - m_stat.m_mcAppendTotalUs.fetch_add(_mcUs, std::memory_order_relaxed); - m_stat.m_mcAppendSampleCount.fetch_add(1, std::memory_order_relaxed); - // Lock wait is path-agnostic; reuse the single-key histogram. 
- IndexStats::HistAdd(m_stat.m_appendLockWaitUs, _lockWaitUs); - m_stat.m_appendLockWaitTotalUs.fetch_add(_lockWaitUs, std::memory_order_relaxed); - if (ret != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MultiChunkAppend failed for %lld!\n", (std::int64_t)headID); - return ret; - } - postingSize = (postingSize + appendNum) * m_vectorInfoSize; - } else { - { static std::atomic _logOnce{0}; if (_logOnce.fetch_add(1) == 0) SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[PATH] Append using SINGLE-KEY Get+Put path (no multi-chunk)\n"); } - std::string fullPosting; - // [DIAG] measure Get latency (suspect B/C: RMW read amplification + grpc) - auto _getBegin = std::chrono::high_resolution_clock::now(); - auto getRet = db->Get(DBKey(headID), &fullPosting, MaxTimeout, &(p_exWorkSpace->m_diskRequests)); - auto _getEnd = std::chrono::high_resolution_clock::now(); - uint64_t _getUs = std::chrono::duration_cast(_getEnd - _getBegin).count(); - // [FIX] Only treat "key absent" (NotFound) as legitimately empty - // posting (e.g. first write to this head). A real RPC/region - // failure (Fail) must NOT be silently turned into an empty - // posting, otherwise the subsequent Put would overwrite the - // existing TiKV value with only the new appendPosting, - // permanently losing every vector previously stored under this - // head. - if (getRet == ErrorCode::Key_NotFound) { - fullPosting.clear(); - } else if (getRet != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Append: TiKV Get failed for headID=%lld (err=%d); aborting RMW to avoid data loss\n", - (std::int64_t)headID, (int)getRet); - m_stat.m_appendGetFail.fetch_add(1, std::memory_order_relaxed); - return getRet; - } - // [DIAG] capture pre-append size BEFORE we mutate fullPosting - uint64_t _preBytes = (uint64_t)fullPosting.size(); - // Diagnostic: detect stale/misaligned bytes in TiKV (e.g. 
residue - // from a previous run with different m_vectorInfoSize, or a prior - // multi-chunk layout sharing the same key prefix). - if (getRet == ErrorCode::Success && - (fullPosting.size() % m_vectorInfoSize) != 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Append: stale-aligned posting in TiKV headID=%lld size=%zu mod=%zu (m_vectorInfoSize=%d)\n", - (std::int64_t)headID, fullPosting.size(), - fullPosting.size() % (size_t)m_vectorInfoSize, - m_vectorInfoSize); - } - fullPosting.append(appendPosting); - postingSize = static_cast(fullPosting.size()); - // [DIAG] measure Put latency + posting size - auto _putBegin = std::chrono::high_resolution_clock::now(); - if ((ret = db->Put(DBKey(headID), fullPosting, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Merge failed for %lld! Posting Size:%d, limit: %d\n", (std::int64_t)headID, postingSize, m_postingSizeLimit); - GetDBStats(); - return ret; - } - auto _putEnd = std::chrono::high_resolution_clock::now(); - uint64_t _putUs = std::chrono::duration_cast(_putEnd - _putBegin).count(); - // [DIAG] record into stat histograms - IndexStats::HistAdd(m_stat.m_appendLockWaitUs, _lockWaitUs); - IndexStats::HistAdd(m_stat.m_appendGetUs, _getUs); - IndexStats::HistAdd(m_stat.m_appendPutUs, _putUs); - IndexStats::HistAdd(m_stat.m_appendPostingBytes, (uint64_t)fullPosting.size()); - m_stat.m_appendLockWaitTotalUs.fetch_add(_lockWaitUs, std::memory_order_relaxed); - m_stat.m_appendGetTotalUs.fetch_add(_getUs, std::memory_order_relaxed); - m_stat.m_appendPutTotalUs.fetch_add(_putUs, std::memory_order_relaxed); - m_stat.m_appendPostingBytesTotal.fetch_add((uint64_t)fullPosting.size(), std::memory_order_relaxed); - m_stat.m_appendRmwSampleCount.fetch_add(1, std::memory_order_relaxed); - // [DIAG] pre-append size + near-threshold tag (>=80% of split limit) - IndexStats::HistAdd(m_stat.m_appendPreBytes, _preBytes); - m_stat.m_appendPreBytesTotal.fetch_add(_preBytes, 
std::memory_order_relaxed); - { - uint64_t _limitBytes = (uint64_t)m_postingSizeLimit * (uint64_t)m_vectorInfoSize; - if (_limitBytes && _preBytes * 5 >= _limitBytes * 4) { - m_stat.m_appendNearThreshold.fetch_add(1, std::memory_order_relaxed); + if ((ret = db->Merge( + DBKey(headID), appendPosting, MaxTimeout, &(p_exWorkSpace->m_diskRequests), postingSize)) != ErrorCode::Success) + { + if (ret == ErrorCode::Posting_OverFlow) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Merge failed:Posting overflow when appending to %lld! Do split and then retry...\n", (std::int64_t)headID); + ret = Split(p_exWorkSpace, headID, false); + if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Split %lld failed!\n", (std::int64_t)headID); + return ret; } + lock.unlock(); + goto checkDeleted; } + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Merge failed for %lld! Posting Size:%d, limit: %d\n", (std::int64_t)headID, postingSize, m_postingSizeLimit); + GetDBStats(); + return ret; } auto appendIOEnd = std::chrono::high_resolution_clock::now(); appendIOSeconds = std::chrono::duration_cast(appendIOEnd - appendIOBegin).count(); + if (postingSize % m_vectorInfoSize != 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "Append: stale-aligned posting in TiKV headID=%lld size=%zu mod=%zu (m_vectorInfoSize=%d)\n", + (std::int64_t)headID, postingSize, + postingSize % m_vectorInfoSize, + m_vectorInfoSize); + } + IndexStats::HistAdd(m_stat.m_appendPostingBytes, (uint64_t)postingSize); + m_stat.m_appendPostingBytesTotal.fetch_add((uint64_t)postingSize, std::memory_order_relaxed); + m_stat.m_appendRmwSampleCount.fetch_add(1, std::memory_order_relaxed); postingSize /= m_vectorInfoSize; } if (postingSize > (m_postingSizeLimit + reassignThreshold)) { @@ -1790,7 +1646,7 @@ namespace SPTAG::SPANN { // } m_stat.m_appendTriggeredSplit.fetch_add(1, std::memory_order_relaxed); if (!reassignThreshold) SplitAsync(headID, postingSize); - else Split(p_exWorkSpace, headID, 
!m_opt->m_disableReassign); + else Split(p_exWorkSpace, headID); } auto appendEnd = std::chrono::high_resolution_clock::now(); double elapsedMSeconds = std::chrono::duration_cast(appendEnd - appendBegin).count(); @@ -1806,6 +1662,94 @@ namespace SPTAG::SPANN { return ErrorCode::Success; } + ErrorCode BatchAppend(ExtraWorkSpace* p_exWorkSpace, std::unordered_map& headAppends, const char* caller) + { + if (headAppends.empty()) return ErrorCode::Success; + + auto appendBegin = std::chrono::high_resolution_clock::now(); + + std::vector keys; + std::vector values; + + std::vector sortedHeadIDs; + sortedHeadIDs.reserve(headAppends.size()); + for (const auto& kv : headAppends) sortedHeadIDs.push_back(kv.first); + std::sort(sortedHeadIDs.begin(), sortedHeadIDs.end()); + + std::vector> heldLocks; + heldLocks.reserve(sortedHeadIDs.size()); + + for (SizeType headID : sortedHeadIDs) + { + auto appendIt = headAppends.find(headID); + if (appendIt == headAppends.end()) continue; + + std::unique_lock headLock(m_rwLocks[headID]); + + if (!m_headIndex->ContainSample(headID, m_layer + 1)) { + headLock.unlock(); + for (std::uint8_t* ptr = (std::uint8_t*)(appendIt->second.data()); + ptr < (std::uint8_t*)(appendIt->second.data() + appendIt->second.size()); + ptr += m_vectorInfoSize) { + SizeType VID = *(SizeType*)(ptr); + uint8_t version = *(uint8_t*)(ptr + sizeof(SizeType)); + if (m_versionMap->GetVersion(VID) == version) { + m_stat.m_headMiss++; + ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID, true); + } + } + continue; + } + + keys.push_back(headID); + values.push_back(appendIt->second); + heldLocks.emplace_back(std::move(headLock)); + } + + if (keys.empty()) return ErrorCode::Success; + + std::vector postingSizes(keys.size(), 0); + auto appendIOBegin = std::chrono::high_resolution_clock::now(); + ErrorCode ret; + auto dbkeys = DBKeys(keys); + if ((ret = db->MultiMerge( + *dbkeys, values, MaxTimeout, &(p_exWorkSpace->m_diskRequests), postingSizes)) != 
ErrorCode::Success) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MultiMerge failed!\n"); + GetDBStats(); + return ret; + } + auto appendIOEnd = std::chrono::high_resolution_clock::now(); + auto appendIOSeconds = std::chrono::duration_cast(appendIOEnd - appendIOBegin).count(); + + for (size_t i = 0; i < keys.size(); i++) { + heldLocks[i].unlock(); + int postingSize = postingSizes[i]; + if (postingSize % m_vectorInfoSize != 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "Append: stale-aligned posting in TiKV headID=%lld size=%zu mod=%zu (m_vectorInfoSize=%d)\n", + (std::int64_t)keys[i], postingSize, + postingSize % m_vectorInfoSize, + m_vectorInfoSize); + } + IndexStats::HistAdd(m_stat.m_appendPostingBytes, (uint64_t)postingSize); + m_stat.m_appendPostingBytesTotal.fetch_add((uint64_t)postingSize, std::memory_order_relaxed); + m_stat.m_appendRmwSampleCount.fetch_add(1, std::memory_order_relaxed); + postingSize /= m_vectorInfoSize; + if (postingSize > m_postingSizeLimit) { + m_stat.m_appendTriggeredSplit.fetch_add(1, std::memory_order_relaxed); + SplitAsync(keys[i], postingSize); + } + auto appendEnd = std::chrono::high_resolution_clock::now(); + double elapsedMSeconds = std::chrono::duration_cast(appendEnd - appendBegin).count(); + m_totalAppendCount++; + m_stat.m_appendTaskNum++; + m_stat.m_appendIOCost += appendIOSeconds; + m_stat.m_appendCost += elapsedMSeconds; + } + return ErrorCode::Success; + } + ErrorCode Reassign(ExtraWorkSpace* p_exWorkSpace, std::shared_ptr vectorInfo, SizeType headPrev) { SizeType VID = *((SizeType*)vectorInfo->c_str()); @@ -1877,10 +1821,7 @@ namespace SPTAG::SPANN { m_versionMap->Load(versionmapPath, m_opt->m_datasetRowsInBlock, m_opt->m_datasetCapacity); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Recovery: Current vector num: %d.\n", m_versionMap->Count()); } - else if (m_opt->m_storage == Storage::ROCKSDBIO) { - m_versionMap->Load(versionmapPath, m_opt->m_datasetRowsInBlock, m_opt->m_datasetCapacity); - 
SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Current vector num: %d.\n", m_versionMap->Count()); - } else if (m_opt->m_storage == Storage::TIKVIO) { + else if (m_opt->m_storage == Storage::ROCKSDBIO || m_opt->m_storage == Storage::TIKVIO) { m_versionMap->Load(versionmapPath, m_opt->m_datasetRowsInBlock, m_opt->m_datasetCapacity); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Current vector num: %d.\n", m_versionMap->Count()); } else if (m_opt->m_storage == Storage::SPDKIO || m_opt->m_storage == Storage::FILEIO) { @@ -2028,9 +1969,12 @@ namespace SPTAG::SPANN { bool p_checkVersionMap) override { // Use coprocessor search if enabled and storage is TiKV +#ifdef TIKV if (m_opt->m_useCoprocessorSearch && m_opt->m_storage == Storage::TIKVIO) { return SearchIndexWithCoprocessor(p_exWorkSpace, p_queryResults, p_stats, truth, found, p_checkVersionMap); } +#endif + if (p_stats) p_stats->m_exSetUpLatency = 0; auto layerTotalStart = std::chrono::high_resolution_clock::now(); @@ -2049,21 +1993,7 @@ namespace SPTAG::SPANN { else remainLimit = m_hardLatencyLimit; auto readStart = std::chrono::high_resolution_clock::now(); - if (m_opt->m_useMultiChunkPosting && m_opt->m_storage == Storage::TIKVIO) { - { static std::atomic _logOnce{0}; if (_logOnce.fetch_add(1) == 0) SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[PATH] SearchIndex using MULTI-CHUNK scan path\n"); } - // Multi-chunk: scan all chunks per posting and concatenate - auto* tikvDB = dynamic_cast(db.get()); - if (!tikvDB) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "[SearchIndex] db is not TiKVIO for multi-chunk!\n"); - return ErrorCode::DiskIOFail; - } - auto dbKeys = DBKeys(p_exWorkSpace->m_postingIDs); - if (tikvDB->MultiScanPostings(*dbKeys, p_exWorkSpace->m_pageBuffers, remainLimit) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "[SearchIndex] multi-chunk scan postings fail!\n"); - return ErrorCode::DiskIOFail; - } - } else { - { static std::atomic _logOnce{0}; if (_logOnce.fetch_add(1) == 0) 
SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[PATH] SearchIndex using SINGLE-KEY Get path (no multi-chunk)\n"); } + { auto keys = DBKeys(p_exWorkSpace->m_postingIDs); if (db->MultiGet(*keys, p_exWorkSpace->m_pageBuffers, remainLimit, &(p_exWorkSpace->m_diskRequests)) != ErrorCode::Success) { @@ -2190,6 +2120,7 @@ namespace SPTAG::SPANN { return ErrorCode::Success; } +#ifdef TIKV // Coprocessor-based search: push distance computation into TiKV. // Instead of fetching raw posting data, sends the query vector and // posting keys to TiKV, which reads postings locally, computes L2 @@ -2307,7 +2238,7 @@ namespace SPTAG::SPANN { queryResults.SetScanned(listElements); return ErrorCode::Success; } - +#endif virtual ErrorCode SearchIndexWithoutParsing(ExtraWorkSpace* p_exWorkSpace) { int retry = 0; @@ -2762,6 +2693,17 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to save head index!\n"); return false; } + std::error_code ec; + std::string prevHeadVectorFile = m_opt->m_indexDirectory + FolderSep + m_opt->m_headIndexFolder + FolderSep + p_headIndex->GetParameter("VectorFilePath"); + std::string curHeadVectorFile = m_opt->m_indexDirectory + FolderSep + m_opt->m_headVectorFile; + std::filesystem::copy_file(prevHeadVectorFile, curHeadVectorFile, std::filesystem::copy_options::overwrite_existing, ec); + if (ec) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Could not copy previous layer headVectorFile %s to %s: %s\n", + prevHeadVectorFile.c_str(), curHeadVectorFile.c_str(), ec.message().c_str()); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Update headVectorFile from %s to %s for layer %d\n", + prevHeadVectorFile.c_str(), curHeadVectorFile.c_str(), m_layer); + } } p_headToLocal.Save(m_opt->m_indexDirectory + FolderSep + m_opt->m_headIDFile); @@ -2825,13 +2767,14 @@ namespace SPTAG::SPANN { } ErrorCode tmp; - if ((tmp = PutPostingToDB(postingID, postinglist, MaxTimeout, &(workSpace.m_diskRequests))) != + if ((tmp = db->Put(DBKey(postingID), 
postinglist, MaxTimeout, &(workSpace.m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "[WriteDB] Put %lld fail!\n", (std::int64_t)index); ret = tmp; return; } + CheckCentroid(postingID, postinglist, "WriteDownAllPostingToDB"); } else { @@ -2869,7 +2812,16 @@ namespace SPTAG::SPANN { } } - return AppendBatchAsync(p_exWorkSpace, headAppends, "AddIndex"); + // Phase 2: Batch append to each headID (one Merge per head instead of per vector) + if (m_opt->m_storage == Storage::TIKVIO) return BatchAppend(p_exWorkSpace, headAppends, "AddIndex"); + + for (auto& [headID, posting] : headAppends) { + int appendNum = static_cast(posting.size() / m_vectorInfoSize); + ErrorCode ret; + if ((ret = Append(p_exWorkSpace, headID, appendNum, posting)) != ErrorCode::Success) + return ret; + } + return ErrorCode::Success; } ErrorCode DeleteIndex(SizeType p_id) override { @@ -2914,12 +2866,12 @@ namespace SPTAG::SPANN { double avgSplitMs = completed > 0 ? (m_totalSplitTimeUs.load() / 1000.0 / completed) : 0; double maxSplitMs = m_maxSplitTimeUs.load() / 1000.0; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "layer %d pending queue:%zu split:%zu merge:%zu reassign:%zu running:%u | " + "layer %d pending queue:%zu split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " "split_latency avg:%.1fms max:%.1fms\n", m_layer, totalJobs, m_splitJobsInFlight.load(), - m_mergeJobsInFlight.load(), m_reassignJobsInFlight.load(), runningJobs, + m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), runningJobs, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), avgSplitMs, maxSplitMs); @@ -3012,10 +2964,7 @@ namespace SPTAG::SPANN { (unsigned long)mcGM, 
(m_stat.m_mcGetCountCacheHit.load() + mcGM) ? (double)mcGM / (m_stat.m_mcGetCountCacheHit.load() + mcGM) : 0.0); - if (IsTiKV()) { - auto* tikv = GetTiKVDB(); - if (tikv) tikv->LogAsyncWaitStatsAndReset(m_layer); - } + db->LogAsyncWaitStatsAndReset(m_layer); } } m_allDonePrinted = true; @@ -3034,12 +2983,12 @@ namespace SPTAG::SPANN { size_t totalJobs = m_splitThreadPool ? m_splitThreadPool->jobsize() : 0; // if (!ShouldLogProgress(totalJobs)) return; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "layer %d pending queue:%zu split:%zu merge:%zu reassign:%zu running:%u | " + "layer %d pending queue:%zu split:%zu merge:%zu append:%zu reassign:%zu running:%u | " "total_submitted split:%zu merge:%zu reassign:%zu append:%zu | " "total_completed split:%zu merge:%zu reassign:%zu | " "split_latency avg:%.1fms max:%.1fms\n", m_layer, totalJobs, - m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_reassignJobsInFlight.load(), + m_splitJobsInFlight.load(), m_mergeJobsInFlight.load(), m_appendJobsInFlight.load(), m_reassignJobsInFlight.load(), m_splitThreadPool ? 
static_cast(m_splitThreadPool->runningJobs()) : 0, m_totalSplitSubmitted.load(), m_totalMergeSubmitted.load(), m_totalReassignSubmitted.load(), m_totalAppendCount.load(), m_totalSplitCompleted.load(), m_totalMergeCompleted.load(), m_totalReassignCompleted.load(), @@ -3075,14 +3024,15 @@ namespace SPTAG::SPANN { ErrorCode GetWritePosting(ExtraWorkSpace* p_exWorkSpace, SizeType pid, std::string& posting, bool write = false) override { ErrorCode ret; if (write) { - if ((ret = PutPostingToDB(pid, posting, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) + if ((ret = db->Put(DBKey(pid), posting, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "[GetWritePosting] Put fail!\n"); return ret; - } + } + CheckCentroid(pid, posting, "GetWritePosting"); // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "PostingSize: %d\n", m_postingSizes.GetSize(pid)); } else { - if ((ret = GetPostingFromDB(pid, &posting, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) + if ((ret = db->Get(DBKey(pid), &posting, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "[GetWritePosting] Get fail!\n"); return ret; @@ -3143,650 +3093,6 @@ namespace SPTAG::SPANN { return keys; } - // Multi-chunk aware helpers: abstract single-key vs chunked access. - // When UseMultiChunkPosting is on and storage is TiKV, use Scan/PutBase/DeletePosting. - // Otherwise, fall back to the standard KeyValueIO Get/Put/Delete. - - inline bool IsMultiChunk() const { - return m_opt->m_useMultiChunkPosting && m_opt->m_storage == Storage::TIKVIO; - } - - inline bool IsTiKV() const { - return m_opt->m_storage == Storage::TIKVIO; - } - - inline TiKVIO* GetTiKVDB() const { - return dynamic_cast(db.get()); - } - - // Read a full posting from DB (Scan for multi-chunk, Get for single-key). 
- ErrorCode GetPostingFromDB(SizeType headID, std::string* posting, - const std::chrono::microseconds& timeout, - std::vector* reqs) { - if (IsMultiChunk()) { - return this->GetTiKVDB()->ScanPosting(DBKey(headID), posting, timeout); - } - return db->Get(DBKey(headID), posting, timeout, reqs); - } - - // Write a full posting to DB (DeletePosting+PutBaseChunk for multi-chunk, Put for single-key). - // This is a compacting write: replaces all chunks with a single base chunk. - // Also updates the posting count key and local cache. - ErrorCode PutPostingToDB(SizeType headID, const std::string& posting, - const std::chrono::microseconds& timeout, - std::vector* reqs) { - if (IsMultiChunk()) { - auto* tikv = this->GetTiKVDB(); - auto _t0 = std::chrono::high_resolution_clock::now(); - auto delRet = tikv->DeletePosting(DBKey(headID)); - auto _t1 = std::chrono::high_resolution_clock::now(); - uint64_t _delUs = std::chrono::duration_cast(_t1 - _t0).count(); - IndexStats::HistAdd(m_stat.m_mcSplitDelUs, _delUs); - m_stat.m_mcSplitDelTotalUs.fetch_add(_delUs, std::memory_order_relaxed); - if (delRet != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "PutPostingToDB: DeletePosting failed for headID %d\n", headID); - return delRet; - } - // Merge PutBaseChunk + SetPostingCount into a single RawBatchPut RPC - // (saves one network round trip per split commit). - int count = static_cast(posting.size() / m_vectorInfoSize); - auto ret = tikv->PutBaseChunkAndCount(DBKey(headID), posting, count, timeout, reqs); - auto _t2 = std::chrono::high_resolution_clock::now(); - uint64_t _putUs = std::chrono::duration_cast(_t2 - _t1).count(); - IndexStats::HistAdd(m_stat.m_mcSplitPutBaseUs, _putUs); - m_stat.m_mcSplitPutBaseTotalUs.fetch_add(_putUs, std::memory_order_relaxed); - // Histogram for SetPostingCount is now folded into PutBaseUs; - // record 0 to keep series aligned with SampleCount and avoid - // re-tooling downstream dashboards. 
- IndexStats::HistAdd(m_stat.m_mcSplitSetCountUs, 0); - m_stat.m_mcSplitWriteSampleCount.fetch_add(1, std::memory_order_relaxed); - if (ret != ErrorCode::Success) { - // [FIX] If the merged write failed, do NOT update the local - // cache: doing so would create a divergence where the cache - // reports the new count while TiKV still holds the old value. - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "PutPostingToDB: PutBaseChunkAndCount failed for headID %d\n", headID); - if (m_postingCountCache) m_postingCountCache->Remove(DBKey(headID)); - return ret; - } - if (m_postingCountCache) m_postingCountCache->Put(DBKey(headID), count); - return ErrorCode::Success; - } - return db->Put(DBKey(headID), posting, timeout, reqs); - } - - // Delete a posting from DB (DeletePosting for multi-chunk, Delete for single-key). - // Also deletes the posting count key and invalidates local cache. - ErrorCode DeletePostingFromDB(SizeType headID) { - if (IsMultiChunk()) { - auto* tikv = this->GetTiKVDB(); - auto countRet = tikv->DeletePostingCount(DBKey(headID)); - if (countRet != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "DeletePostingFromDB: DeletePostingCount failed for headID %d\n", headID); - } - if (m_postingCountCache) m_postingCountCache->Remove(DBKey(headID)); - return tikv->DeletePosting(DBKey(headID)); - } - return db->Delete(DBKey(headID)); - } - - // Get the posting vector count, using local cache with TiKV fallback. - // Returns count >= 0 on success (including 0 for legitimately-empty posting), - // or -1 if the count is currently unknown (TiKV error). Callers MUST treat - // -1 as "do not proceed with operations that depend on count" -- in - // particular, the Append RMW path must NOT pass an unknown count to - // AppendChunkAndUpdateCount, which would write - // newCount = 0 + appendNum - // and overwrite the (correct, larger) count value already stored in TiKV. 
- int GetCachedPostingCount(SizeType headID) { - if (!m_postingCountCache) return 0; - SizeType dbKey = DBKey(headID); - auto [count, hit] = m_postingCountCache->Get(dbKey); - if (hit) { - m_stat.m_mcGetCountCacheHit.fetch_add(1, std::memory_order_relaxed); - return count; - } - // Cache miss: fetch from TiKV - auto* tikv = this->GetTiKVDB(); - if (!tikv) return 0; - auto _gcBegin = std::chrono::high_resolution_clock::now(); - count = tikv->GetPostingCount(dbKey, std::chrono::microseconds(5000000)); - auto _gcEnd = std::chrono::high_resolution_clock::now(); - uint64_t _gcUs = std::chrono::duration_cast(_gcEnd - _gcBegin).count(); - IndexStats::HistAdd(m_stat.m_mcGetCountMissUs, _gcUs); - m_stat.m_mcGetCountMissTotalUs.fetch_add(_gcUs, std::memory_order_relaxed); - m_stat.m_mcGetCountCacheMiss.fetch_add(1, std::memory_order_relaxed); - if (count < 0) { - // [FIX] Distinguish a true TiKV error from a legitimate 0 count. - // Returning 0 here would mislead the Append RMW into writing - // newCount=appendNum and silently truncating the posting count - // in TiKV (data drift). Return -1 sentinel; callers handle it. - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "GetCachedPostingCount: TiKV error for headID %d, returning -1 (unknown)\n", headID); - return -1; - } - m_postingCountCache->Put(dbKey, count); - return count; - } - - // Update posting count after appending vectors. - // Writes to TiKV via BatchPut (chunk + count in one RPC) and updates local cache. 
- ErrorCode AppendChunkAndUpdateCount(SizeType headID, const std::string& appendPosting, - int appendNum, int oldCount, - const std::chrono::microseconds& timeout, - std::vector* reqs) { - auto* tikv = this->GetTiKVDB(); - if (!tikv) return ErrorCode::Fail; - int newCount = oldCount + appendNum; - auto ret = tikv->PutChunkAndCount(DBKey(headID), appendPosting, newCount, timeout, reqs); - if (ret == ErrorCode::Success && m_postingCountCache) { - m_postingCountCache->Put(DBKey(headID), newCount); - } - return ret; - } - - // Truly-async fan-out for AddIndex Phase 2 multi-chunk path. - // - // Goal: a single insert worker calling AddIndex with N (12-30) head - // appends issues N TiKV BatchPut RPCs all at once into a shared - // CompletionQueue (one pump thread for the whole process), then blocks - // ONCE on the wait-group until every RPC has signalled. No std::async, - // no per-RPC OS thread. This converts the worker's RPC time from - // O(N * RPC_latency) into O(RPC_latency). - // - // Per-head logic mirrors Append(): - // 1. If head was deleted: ReassignAsync each vector, skip. - // 2. Take write locks in global shard order. AppendBatchAsync splits - // same-shard collisions into separate rounds before reaching here. - // 3. Read posting count from cache (or m_splitList for pending splits, - // or TiKV on miss). If unknown (TiKV error), abort this head. - // 4. If newCount would overflow the posting+buffer limit, schedule - // SplitAsync (same as Append's reassignThreshold==0 branch) and - // proceed with the append anyway. - // 5. Build (chunkKey, chunkValue, countKey, countValue) and submit - // AsyncAppendChunkAndUpdateCount on the shared cq. - // - // After Wait(): - // * For each head whose async RPC failed (region_error / RPC error / - // TiKV error), retry synchronously via the existing - // AppendChunkAndUpdateCount path (which has its own region-discovery - // retry loop). - // * For each head that succeeded, update the local count cache. 
- // * RAII-release every per-head write lock as the Pending vector - // destructs. - ErrorCode AddIndexAsyncMultiChunk(ExtraWorkSpace* p_exWorkSpace, - std::unordered_map& headAppends, - const char* caller) - { - auto* tikv = this->GetTiKVDB(); - if (!tikv) return ErrorCode::Fail; - - struct Pending { - SizeType headID = 0; - std::string* posting = nullptr; - int appendNum = 0; - int oldCount = 0; - bool splitPending = false; - bool countKnown = false; - bool countOk = true; - std::unique_lock lock; - }; - - std::vector pendings; - pendings.reserve(headAppends.size()); - std::vector countMissPendingIndices; - std::vector countMissDBKeys; - - auto _phase2Begin = std::chrono::high_resolution_clock::now(); - - std::vector orderedHeads; - orderedHeads.reserve(headAppends.size()); - for (auto& entry : headAppends) orderedHeads.push_back(entry.first); - std::sort(orderedHeads.begin(), orderedHeads.end(), [this](SizeType a, SizeType b) { - unsigned sa = m_rwLocks.hash_func(a); - unsigned sb = m_rwLocks.hash_func(b); - return sa == sb ? a < b : sa < sb; - }); - - // ---- Pass 1: lock + decide per head (sync, but cheap: cache hits) ---- - for (SizeType hid : orderedHeads) { - auto it = headAppends.find(hid); - if (it == headAppends.end()) continue; - auto& posting = it->second; - Pending p; - p.headID = hid; - p.posting = &posting; - p.appendNum = static_cast(posting.size() / m_vectorInfoSize); - - // Deleted-head reassign path (mirrors Append()'s checkDeleted block). 
- if (!m_headIndex->ContainSample(p.headID, m_layer + 1)) { - for (int i = 0; i < p.appendNum; i++) { - uint32_t idx = i * m_vectorInfoSize; - SizeType VID = *(SizeType*)(&(*p.posting)[idx]); - uint8_t version = *(uint8_t*)(&(*p.posting)[idx + sizeof(SizeType)]); - auto vectorInfo = std::make_shared( - p.posting->c_str() + idx, m_vectorInfoSize); - if (m_versionMap->GetVersion(VID) == version) { - m_stat.m_headMiss++; - ReassignAsync(vectorInfo, p.headID); - } - } - continue; - } - - p.lock = std::unique_lock(m_rwLocks[p.headID]); - - // Re-check after lock (head could have been split/deleted between - // ContainSample above and the lock acquisition). - if (!m_headIndex->ContainSample(p.headID, m_layer + 1)) { - p.lock.unlock(); - // Re-run reassign logic: vectors that targeted this head must - // be re-routed instead of silently dropped. - for (int i = 0; i < p.appendNum; i++) { - uint32_t idx = i * m_vectorInfoSize; - SizeType VID = *(SizeType*)(&(*p.posting)[idx]); - uint8_t version = *(uint8_t*)(&(*p.posting)[idx + sizeof(SizeType)]); - auto vectorInfo = std::make_shared( - p.posting->c_str() + idx, m_vectorInfoSize); - if (m_versionMap->GetVersion(VID) == version) { - m_stat.m_headMiss++; - ReassignAsync(vectorInfo, p.headID); - } - } - continue; - } - - int oldCount = 0; - bool splitPending = false; - { - std::shared_lock sl(m_splitListLock); - auto it = m_splitList.find(p.headID); - if (it != m_splitList.end()) { - oldCount = it->second; - splitPending = true; - } - } - if (oldCount == 0) { - SizeType dbKey = DBKey(p.headID); - auto cached = m_postingCountCache ? 
m_postingCountCache->Get(dbKey) : std::make_pair(0, false); - if (cached.second) { - m_stat.m_mcGetCountCacheHit.fetch_add(1, std::memory_order_relaxed); - oldCount = cached.first; - p.countKnown = true; - } else { - p.countKnown = false; - } - } else { - p.countKnown = true; - } - - p.oldCount = oldCount; - p.splitPending = splitPending; - pendings.push_back(std::move(p)); - if (!pendings.back().countKnown) { - countMissPendingIndices.push_back(pendings.size() - 1); - countMissDBKeys.push_back(DBKey(pendings.back().headID)); - } - } - - if (pendings.empty()) return ErrorCode::Success; - - if (!countMissPendingIndices.empty()) { - std::vector fetchedCounts; - auto _countBegin = std::chrono::high_resolution_clock::now(); - tikv->AsyncGetPostingCounts(countMissDBKeys, &fetchedCounts, - std::chrono::microseconds(5000000)); - auto _countEnd = std::chrono::high_resolution_clock::now(); - uint64_t _countUs = std::chrono::duration_cast(_countEnd - _countBegin).count(); - uint64_t _perMissUs = _countUs / static_cast(std::max(countMissPendingIndices.size(), 1)); - for (size_t i = 0; i < countMissPendingIndices.size(); i++) { - size_t pendingIdx = countMissPendingIndices[i]; - auto& p = pendings[pendingIdx]; - m_stat.m_mcGetCountCacheMiss.fetch_add(1, std::memory_order_relaxed); - IndexStats::HistAdd(m_stat.m_mcGetCountMissUs, _perMissUs); - m_stat.m_mcGetCountMissTotalUs.fetch_add(_perMissUs, std::memory_order_relaxed); - if (i < fetchedCounts.size() && fetchedCounts[i] >= 0) { - p.oldCount = fetchedCounts[i]; - p.countKnown = true; - if (m_postingCountCache) m_postingCountCache->Put(countMissDBKeys[i], p.oldCount); - } else { - p.countOk = false; - m_stat.m_appendGetFail.fetch_add(1, std::memory_order_relaxed); - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "AddIndexAsyncMultiChunk: posting count unknown for headID=%lld; skipping\n", - (std::int64_t)p.headID); - } - } - } - - std::vector readyPendings; - readyPendings.reserve(pendings.size()); - for (auto& p : pendings) { - 
if (!p.countOk || !p.countKnown) { - p.lock.unlock(); - continue; - } - if (!p.splitPending && - p.oldCount + p.appendNum > (m_postingSizeLimit + m_bufferSizeLimit)) { - SplitAsync(p.headID, p.oldCount + p.appendNum); - } - readyPendings.push_back(std::move(p)); - } - pendings = std::move(readyPendings); - - if (pendings.empty()) return ErrorCode::Success; - - // ---- Pass 2: fire all async BatchPuts into the cq ---- - // result_slots must NOT move once a tag holds a pointer into it, - // so reserve and use indices. - auto batch = std::make_shared(); - batch->Add(static_cast(pendings.size())); - - // std::vector> is non-resizable but in-place - // constructible to size N with zero-initialization. - std::vector> okFlags(pendings.size()); - for (auto& f : okFlags) f.store(0, std::memory_order_relaxed); - - for (size_t i = 0; i < pendings.size(); i++) { - auto& p = pendings[i]; - int newCount = p.oldCount + p.appendNum; - tikv->AsyncAppendChunkAndUpdateCount( - DBKey(p.headID), *p.posting, newCount, - batch, &okFlags[i], MaxTimeout); - } - - // ---- Pass 3: single thread blocks on wait-group ---- - auto _waitBegin = std::chrono::high_resolution_clock::now(); - batch->Wait(); - auto _waitEnd = std::chrono::high_resolution_clock::now(); - bool fromCollectReAssign = caller != nullptr && std::string(caller) == "CollectReAssign"; - tikv->RecordAsyncWait(fromCollectReAssign ? 
- TiKVIO::AsyncWaitKind::CollectReAssignMultiChunk : - TiKVIO::AsyncWaitKind::AddIndexMultiChunk, - pendings.size(), - static_cast(std::chrono::duration_cast( - _waitEnd - _waitBegin).count())); - - // ---- Pass 4: process results, sync-retry failures ---- - ErrorCode firstErr = ErrorCode::Success; - int retryCount = 0; - for (size_t i = 0; i < pendings.size(); i++) { - auto& p = pendings[i]; - int newCount = p.oldCount + p.appendNum; - if (okFlags[i].load(std::memory_order_acquire) == 1) { - if (m_postingCountCache) m_postingCountCache->Put(DBKey(p.headID), newCount); - } else { - // Sync retry preserves the existing region-cache invalidation - // + retry semantics already battle-tested in PutChunkAndCount. - retryCount++; - auto ret = AppendChunkAndUpdateCount( - p.headID, *p.posting, p.appendNum, p.oldCount, - MaxTimeout, &(p_exWorkSpace->m_diskRequests)); - if (ret != ErrorCode::Success && firstErr == ErrorCode::Success) { - firstErr = ret; - } - } - // Lock released by Pending dtor on scope exit. - } - - // Histogram: total Phase-2 wall time + sync-retry count, for sizing - // future tuning of the cq pump and async batch size. 
- uint64_t _totalUs = std::chrono::duration_cast( - _waitEnd - _phase2Begin).count(); - IndexStats::HistAdd(m_stat.m_mcAppendUs, _totalUs); - m_stat.m_mcAppendTotalUs.fetch_add(_totalUs, std::memory_order_relaxed); - m_stat.m_mcAppendSampleCount.fetch_add(1, std::memory_order_relaxed); - if (retryCount > 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "AddIndexAsyncMultiChunk: %zu heads, %d sync-retries, %lluus total\n", - pendings.size(), retryCount, (unsigned long long)_totalUs); - } - return firstErr; - } - - ErrorCode AppendBatchAsync(ExtraWorkSpace* p_exWorkSpace, - std::unordered_map& headAppends, - const char* caller) - { - if (headAppends.empty()) return ErrorCode::Success; - - std::vector> shardSafeRounds; - std::vector> roundShards; - for (const auto& entry : headAppends) { - unsigned shard = m_rwLocks.hash_func(entry.first); - bool placed = false; - for (size_t r = 0; r < shardSafeRounds.size(); r++) { - if (roundShards[r].insert(shard).second) { - shardSafeRounds[r].emplace(entry.first, entry.second); - placed = true; - break; - } - } - if (!placed) { - roundShards.emplace_back(); - roundShards.back().insert(shard); - shardSafeRounds.emplace_back(); - shardSafeRounds.back().emplace(entry.first, entry.second); - } - } - - if (shardSafeRounds.size() > 1) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "%s async append split: %zu heads into %zu shard-safe rounds\n", - caller, headAppends.size(), shardSafeRounds.size()); - ErrorCode firstErr = ErrorCode::Success; - for (auto& round : shardSafeRounds) { - ErrorCode ret = AppendBatchAsync(p_exWorkSpace, round, caller); - if (ret != ErrorCode::Success && firstErr == ErrorCode::Success) firstErr = ret; - } - return firstErr; - } - - // TiKV multi-chunk: fan out one async BatchPut per target head - // (chunk append + count update), then sync-retry failed heads. 
- if (IsMultiChunk() && headAppends.size() > 1) { - return AddIndexAsyncMultiChunk(p_exWorkSpace, headAppends, caller); - } - - // TiKV single-key: fan out Get, locally merge, then fan out Put. - if (!IsMultiChunk() && IsTiKV() && headAppends.size() > 1) { - return AddIndexAsyncSingleKey(p_exWorkSpace, headAppends); - } - - ErrorCode firstErr = ErrorCode::Success; - for (auto& [headID, posting] : headAppends) { - int appendNum = static_cast(posting.size() / m_vectorInfoSize); - ErrorCode ret = Append(p_exWorkSpace, headID, appendNum, posting, 0); - if (ret != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "%s Append failed for head %d, count %d\n", - caller, headID, appendNum); - if (firstErr == ErrorCode::Success) firstErr = ret; - } - } - return firstErr; - } - - // Single-key fast path mirror of AddIndexAsyncMultiChunk: parallel - // Get → local merge → parallel Put for N heads at once. Used when - // UseMultiChunkPosting=false on TiKV. Same lock & RAII discipline: - // Pass 1 sequential lock + reassign-on-deleted-head, Pass 2 fan-out - // AsyncRawGet, Pass 3 local RMW, Pass 4 fan-out AsyncRawPut, Pass 5 - // sync-retry failures via existing Append(). Splits triggered by - // post-merge size are scheduled via SplitAsync after the Put. - ErrorCode AddIndexAsyncSingleKey(ExtraWorkSpace* p_exWorkSpace, - std::unordered_map& headAppends) - { - auto* tikv = this->GetTiKVDB(); - if (!tikv) return ErrorCode::Fail; - - struct Pending { - SizeType headID = 0; - std::string* appendPosting = nullptr; // points into headAppends - int appendNum = 0; - std::string fullPosting; // filled by Get, then merged - std::atomic found{false}; - std::atomic getOk{0}; - std::atomic putOk{0}; - std::unique_lock lock; - bool active = true; // false → skipped (deleted/reassigned) - int finalSize = 0; // post-merge byte size, for split trigger - }; - - // Reserve so addresses passed to AsyncRawGet/Put are stable. 
- std::vector> pendings; - pendings.reserve(headAppends.size()); - - auto _phase2Begin = std::chrono::high_resolution_clock::now(); - - std::vector orderedHeads; - orderedHeads.reserve(headAppends.size()); - for (auto& entry : headAppends) orderedHeads.push_back(entry.first); - std::sort(orderedHeads.begin(), orderedHeads.end(), [this](SizeType a, SizeType b) { - unsigned sa = m_rwLocks.hash_func(a); - unsigned sb = m_rwLocks.hash_func(b); - return sa == sb ? a < b : sa < sb; - }); - - // ---- Pass 1: lock + reassign-on-deleted ---- - for (SizeType hid : orderedHeads) { - auto it = headAppends.find(hid); - if (it == headAppends.end()) continue; - auto& posting = it->second; - auto p = std::make_unique(); - p->headID = hid; - p->appendPosting = &posting; - p->appendNum = static_cast(posting.size() / m_vectorInfoSize); - - if (!m_headIndex->ContainSample(p->headID, m_layer + 1)) { - for (int i = 0; i < p->appendNum; i++) { - uint32_t idx = i * m_vectorInfoSize; - SizeType VID = *(SizeType*)(&(*p->appendPosting)[idx]); - uint8_t version = *(uint8_t*)(&(*p->appendPosting)[idx + sizeof(SizeType)]); - auto vectorInfo = std::make_shared( - p->appendPosting->c_str() + idx, m_vectorInfoSize); - if (m_versionMap->GetVersion(VID) == version) { - m_stat.m_headMiss++; - ReassignAsync(vectorInfo, p->headID); - } - } - continue; - } - - p->lock = std::unique_lock(m_rwLocks[p->headID]); - - if (!m_headIndex->ContainSample(p->headID, m_layer + 1)) { - p->lock.unlock(); - for (int i = 0; i < p->appendNum; i++) { - uint32_t idx = i * m_vectorInfoSize; - SizeType VID = *(SizeType*)(&(*p->appendPosting)[idx]); - uint8_t version = *(uint8_t*)(&(*p->appendPosting)[idx + sizeof(SizeType)]); - auto vectorInfo = std::make_shared( - p->appendPosting->c_str() + idx, m_vectorInfoSize); - if (m_versionMap->GetVersion(VID) == version) { - m_stat.m_headMiss++; - ReassignAsync(vectorInfo, p->headID); - } - } - continue; - } - - pendings.push_back(std::move(p)); - } - - if (pendings.empty()) return 
ErrorCode::Success; - - std::vector keys(pendings.size()); - for (size_t i = 0; i < pendings.size(); i++) { - SizeType k = DBKey(pendings[i]->headID); - keys[i] = std::string(reinterpret_cast(&k), sizeof(SizeType)); - } - - // ---- Pass 2: region-batched RawBatchGet for all target heads ---- - std::vector getValues; - std::vector getOk; - auto _getWaitBegin = std::chrono::high_resolution_clock::now(); - tikv->MultiGetWithStatus(keys, &getValues, &getOk, MaxTimeout, - &(p_exWorkSpace->m_diskRequests)); - tikv->RecordAsyncWait(TiKVIO::AsyncWaitKind::AddIndexSingleKeyGet, - pendings.size(), - static_cast(std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - _getWaitBegin).count())); - - // ---- Pass 3: local RMW. On Get failure, mark inactive and - // sync-retry via Append() in Pass 5 (preserves existing data-loss - // safety semantics: only "key absent" is treated as empty, every - // other Get failure aborts this head's RMW). - int activeCount = 0; - std::vector putKeys; - std::vector putValues; - std::vector putIndexByPending(pendings.size(), -1); - putKeys.reserve(pendings.size()); - putValues.reserve(pendings.size()); - for (size_t i = 0; i < pendings.size(); i++) { - auto& p = pendings[i]; - if (i >= getOk.size() || getOk[i] == 0) { - p->active = false; // sync-retry in Pass 5 - continue; - } - p->fullPosting = std::move(getValues[i]); - p->fullPosting.append(*p->appendPosting); - p->finalSize = static_cast(p->fullPosting.size()); - putIndexByPending[i] = static_cast(putKeys.size()); - putKeys.push_back(keys[i]); - putValues.push_back(std::move(p->fullPosting)); - activeCount++; - } - - std::vector putOk; - if (activeCount > 0) { - auto _putWaitBegin = std::chrono::high_resolution_clock::now(); - tikv->MultiPutWithStatus(putKeys, putValues, &putOk, MaxTimeout, - &(p_exWorkSpace->m_diskRequests)); - tikv->RecordAsyncWait(TiKVIO::AsyncWaitKind::AddIndexSingleKeyPut, - activeCount, - static_cast(std::chrono::duration_cast( - 
std::chrono::high_resolution_clock::now() - _putWaitBegin).count())); - } - - // ---- Pass 4: process results + post-merge SplitAsync ---- - ErrorCode firstErr = ErrorCode::Success; - int retryCount = 0; - for (size_t i = 0; i < pendings.size(); i++) { - auto& p = pendings[i]; - int putIndex = putIndexByPending[i]; - bool needRetry = !p->active || putIndex < 0 || - static_cast(putIndex) >= putOk.size() || - putOk[putIndex] == 0; - if (needRetry) { - // Drop our lock so Append() can re-acquire it cleanly. - p->lock.unlock(); - retryCount++; - auto ret = Append(p_exWorkSpace, p->headID, p->appendNum, *p->appendPosting); - if (ret != ErrorCode::Success && firstErr == ErrorCode::Success) { - firstErr = ret; - } - continue; - } - // Async path succeeded; mirror Append()'s post-Put split trigger - // (SplitAsync is idempotent w.r.t. m_splitList). - int postingSize = p->finalSize / m_vectorInfoSize; - if (postingSize > m_postingSizeLimit) { - m_stat.m_appendTriggeredSplit.fetch_add(1, std::memory_order_relaxed); - SplitAsync(p->headID, postingSize); - } - // Lock released on Pending dtor. - } - - uint64_t _totalUs = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - _phase2Begin).count(); - if (retryCount > 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "AddIndexAsyncSingleKey: %zu heads, %d sync-retries, %lluus total\n", - pendings.size(), retryCount, (unsigned long long)_totalUs); - } - return firstErr; - } - private: int m_metaDataSize = 0; diff --git a/AnnService/inc/Core/SPANN/ExtraTiKVController.h b/AnnService/inc/Core/SPANN/ExtraTiKVController.h index c0af1d6ef..d7528d479 100644 --- a/AnnService/inc/Core/SPANN/ExtraTiKVController.h +++ b/AnnService/inc/Core/SPANN/ExtraTiKVController.h @@ -30,6 +30,73 @@ namespace SPTAG::SPANN { + // Simple sharded LRU cache for posting vector counts. + // Thread-safe: each shard has its own mutex. 
+ class PostingCountCache { + public: + PostingCountCache(size_t capacity = 100000, int shards = 16) + : m_shards(shards), m_capacity(std::max(capacity / shards, (size_t)1)) { + m_data.resize(shards); + m_mutexes = std::make_unique(shards); + } + + // Returns (count, true) on hit, (0, false) on miss. + std::pair Get(SizeType headID) { + int s = Shard(headID); + std::lock_guard lock(m_mutexes[s]); + auto& shard = m_data[s]; + auto it = shard.map.find(headID); + if (it == shard.map.end()) return {0, false}; + // Move to front (most recently used) + shard.order.splice(shard.order.begin(), shard.order, it->second); + return {it->second->second, true}; + } + + void Put(SizeType headID, int count) { + int s = Shard(headID); + std::lock_guard lock(m_mutexes[s]); + auto& shard = m_data[s]; + auto it = shard.map.find(headID); + if (it != shard.map.end()) { + it->second->second = count; + shard.order.splice(shard.order.begin(), shard.order, it->second); + return; + } + // Evict if full + if (shard.map.size() >= m_capacity) { + auto& back = shard.order.back(); + shard.map.erase(back.first); + shard.order.pop_back(); + } + shard.order.emplace_front(headID, count); + shard.map[headID] = shard.order.begin(); + } + + void Remove(SizeType headID) { + int s = Shard(headID); + std::lock_guard lock(m_mutexes[s]); + auto& shard = m_data[s]; + auto it = shard.map.find(headID); + if (it != shard.map.end()) { + shard.order.erase(it->second); + shard.map.erase(it); + } + } + + private: + int Shard(SizeType headID) const { return static_cast(headID) % m_shards; } + + struct ShardData { + std::list> order; // front = MRU + std::unordered_map>::iterator> map; + }; + + int m_shards; + size_t m_capacity; // per shard + std::vector m_data; + std::unique_ptr m_mutexes; + }; + /// TiKVIO implements the KeyValueIO interface by communicating with a TiKV /// cluster via its RawKV gRPC API. 
/// @@ -112,7 +179,7 @@ namespace SPTAG::SPANN m_asyncWaitSampleCount[k].fetch_add(1, std::memory_order_relaxed); } - void LogAsyncWaitStatsAndReset(int layer) { + void LogAsyncWaitStatsAndReset(int layer) override { for (int k = 0; k < static_cast(AsyncWaitKind::Count); k++) { uint64_t samples = m_asyncWaitSampleCount[k].exchange(0, std::memory_order_relaxed); uint64_t totalUs = m_asyncWaitTotalUs[k].exchange(0, std::memory_order_relaxed); @@ -321,8 +388,8 @@ namespace SPTAG::SPANN } }; - TiKVIO(const std::string& pdAddresses, const std::string& keyPrefix, int asyncRpcMaxInflight = 0) - : m_keyPrefix(keyPrefix), + TiKVIO(const std::string& pdAddresses, const std::string& keyPrefix, bool useMultiChunkPosting, int postingCountCacheCapacity, int asyncRpcMaxInflight = 0) + : m_keyPrefix(keyPrefix), m_useMultiChunkPosting(useMultiChunkPosting), m_asyncRpcMaxInflight(std::max(asyncRpcMaxInflight, 0)) { // Parse comma-separated PD addresses and try to connect. @@ -403,6 +470,12 @@ namespace SPTAG::SPANN return; } + // Initialize posting count cache for multi-chunk mode + if (m_useMultiChunkPosting) { + postingCountCacheCapacity = max(postingCountCacheCapacity, 1); + m_postingCountCache = std::make_unique(postingCountCacheCapacity, 16); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "PostingCountCache initialized (capacity=%d, shards=16)\n", postingCountCacheCapacity); + } m_available = true; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "TiKVIO: Initialized with key prefix '%s'\n", m_keyPrefix.c_str()); if (m_asyncRpcMaxInflight > 0) { @@ -515,6 +588,9 @@ namespace SPTAG::SPANN const std::chrono::microseconds& timeout, std::vector* reqs) override { + if (m_useMultiChunkPosting) { + return ScanPosting(key, value, timeout); + } std::string k(reinterpret_cast(&key), sizeof(SizeType)); return Get(k, value, timeout, reqs); } @@ -570,7 +646,23 @@ namespace SPTAG::SPANN ErrorCode Put(const SizeType key, const std::string& value, const std::chrono::microseconds& timeout, std::vector* reqs) 
override - { + { + if (m_useMultiChunkPosting) { + auto delRet = DeletePosting(key); + if (delRet != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "PutPostingToDB: DeletePosting failed for key %d\n", key); + return delRet; + } + int count = static_cast(value.size()); + auto ret = PutBaseChunkAndCount(key, value, count, timeout, reqs); + if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "PutPostingToDB: PutBaseChunkAndCount failed for key %d\n", key); + return ret; + } + if (m_postingCountCache) m_postingCountCache->Put(key, count); + return ErrorCode::Success; + } + std::string k(reinterpret_cast(&key), sizeof(SizeType)); return Put(k, value, timeout, reqs); } @@ -578,6 +670,15 @@ namespace SPTAG::SPANN // ---- Delete operations ---- ErrorCode Delete(SizeType key) override { + if (m_useMultiChunkPosting) { + auto countRet = DeletePostingCount(key); + if (countRet != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "DeletePostingFromDB: DeletePostingCount failed for headID %d\n", key); + } + if (m_postingCountCache) m_postingCountCache->Remove(key); + return DeletePosting(key); + } + std::string k(reinterpret_cast(&key), sizeof(SizeType)); std::string prefixedKey = MakePrefixedKey(k); @@ -680,18 +781,45 @@ namespace SPTAG::SPANN return ErrorCode::Fail; } - std::string existingValue; - auto ret = Get(key, &existingValue, timeout, reqs); - if (ret != ErrorCode::Success) { - // Key doesn't exist yet, just put the new value. 
- size = static_cast(value.size()); - return Put(key, value, timeout, reqs); - } + if (m_useMultiChunkPosting) { + auto [count, hit] = m_postingCountCache->Get(key); + if (!hit) { + count = GetPostingCount(key, std::chrono::microseconds(5000000)); + if (count < 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "GetCachedPostingCount: TiKV error for headID %d, returning 0\n", key); + return ErrorCode::Posting_SizeError; + } + m_postingCountCache->Put(key, count); + } + { static std::atomic _logOnce{0}; if (_logOnce.fetch_add(1) == 0) SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[PATH] Append using MULTI-CHUNK AppendChunk path\n"); } - // Append the new value to existing - existingValue.append(value); - size = static_cast(existingValue.size()); - return Put(key, existingValue, timeout, reqs); + int newCount = count + value.size(); + auto ret =PutChunkAndCount(key, value, newCount, timeout, reqs); + if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MultiChunkAppend failed for %lld!\n", (std::int64_t)key); + return ret; + } + if (m_postingCountCache) m_postingCountCache->Put(key, newCount); + size = newCount; + } else { + { static std::atomic _logOnce{0}; if (_logOnce.fetch_add(1) == 0) SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[PATH] Append using SINGLE-KEY Get+Put path (no multi-chunk)\n"); } + std::string fullPosting; + auto ret = Get(key, &fullPosting, MaxTimeout, reqs); + if (ret == ErrorCode::Key_NotFound) { + fullPosting.clear(); + } else if (ret != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Merge failed to read existing posting for %lld before append.\n", (std::int64_t)key); + return ret; + } + + fullPosting.append(value); + size = static_cast(fullPosting.size()); + if ((ret = Put(key, fullPosting, MaxTimeout, reqs)) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Merge failed for %lld! 
Posting Size:%d\n", (std::int64_t)key, size); + return ret; + } + } + return ErrorCode::Success; } // ---- MultiGet operations ---- @@ -703,7 +831,11 @@ namespace SPTAG::SPANN std::vector>& values, const std::chrono::microseconds& timeout, std::vector* reqs) override - { + { + if (m_useMultiChunkPosting) { + return MultiScanPostings(keys, values, timeout); + } + if (keys.empty()) return ErrorCode::Success; struct PendingRegionGroup { @@ -1063,6 +1195,146 @@ namespace SPTAG::SPANN return failCount == 0 ? ErrorCode::Success : ErrorCode::Fail; } + ErrorCode MultiMerge(const std::vector& keys, const std::vector& values, + const std::chrono::microseconds& timeout, std::vector* reqs, std::vector& sizes) override + { + if (keys.empty()) { + sizes.clear(); + return ErrorCode::Success; + } + + ErrorCode firstErr = ErrorCode::Success; + sizes.resize(keys.size()); + if (m_useMultiChunkPosting) { + std::vector fetchedCounts; + ErrorCode countRet = AsyncGetPostingCounts(keys, &fetchedCounts, + std::chrono::microseconds(5000000)); + if (countRet != ErrorCode::Success || fetchedCounts.size() != keys.size()) { + if (fetchedCounts.size() != keys.size()) fetchedCounts.assign(keys.size(), -1); + for (size_t i = 0; i < keys.size(); i++) { + if (fetchedCounts[i] >= 0) continue; + fetchedCounts[i] = GetPostingCount(keys[i], MaxTimeout); + } + } + for (size_t i = 0; i < keys.size(); i++) { + if (fetchedCounts[i] < 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "TiKVIO::MultiMerge failed to fetch posting count headID=%d\n", keys[i]); + return ErrorCode::Fail; + } + } + + auto batch = std::make_shared(); + batch->Add(static_cast(keys.size())); + + // std::vector> is non-resizable but in-place + // constructible to size N with zero-initialization. 
+ std::vector> okFlags(keys.size()); + for (auto& f : okFlags) f.store(0, std::memory_order_relaxed); + + for (size_t i = 0; i < keys.size(); i++) { + int newCount = fetchedCounts[i] + static_cast(values[i].size()); + AsyncAppendChunkAndUpdateCount( + keys[i], values[i], newCount, + batch, &okFlags[i], MaxTimeout); + } + + // ---- Pass 3: single thread blocks on wait-group ---- + auto _waitBegin = std::chrono::high_resolution_clock::now(); + batch->Wait(); + auto _waitEnd = std::chrono::high_resolution_clock::now(); + RecordAsyncWait(TiKVIO::AsyncWaitKind::CollectReAssignMultiChunk, + keys.size(), + static_cast(std::chrono::duration_cast( + _waitEnd - _waitBegin).count())); + + // ---- Pass 4: process results, sync-retry failures ---- + int retryCount = 0; + for (size_t i = 0; i < keys.size(); i++) { + sizes[i] = fetchedCounts[i] + static_cast(values[i].size()); + if (okFlags[i].load(std::memory_order_acquire) == 1) { + if (m_postingCountCache) m_postingCountCache->Put(keys[i], sizes[i]); + } else { + // Sync retry preserves the existing region-cache invalidation + // + retry semantics already battle-tested in PutChunkAndCount. 
+ retryCount++; + auto ret = PutChunkAndCount(keys[i], values[i], sizes[i], MaxTimeout, reqs); + if (ret == ErrorCode::Success && m_postingCountCache) { + m_postingCountCache->Put(keys[i], sizes[i]); + } + if (ret != ErrorCode::Success && firstErr == ErrorCode::Success) { + firstErr = ret; + } + } + } + } + else { + std::vector getKeys(keys.size()); + std::vector getValues; + std::vector getOk; + for (int i = 0; i < keys.size(); i++) getKeys[i] = std::string(reinterpret_cast(&keys[i]), sizeof(SizeType)); + + auto _getWaitBegin = std::chrono::high_resolution_clock::now(); + MultiGetWithStatus(getKeys, &getValues, &getOk, MaxTimeout, reqs); + RecordAsyncWait(TiKVIO::AsyncWaitKind::AddIndexSingleKeyGet, + keys.size(), + static_cast(std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - _getWaitBegin).count())); + + int activeCount = 0; + std::vector putKeys; + std::vector putValues; + std::vector putIndexByPending(keys.size(), -1); + putValues.reserve(keys.size()); + for (size_t i = 0; i < keys.size(); i++) { + if (i >= getOk.size() || getOk[i] == 0) { + continue; + } + getValues[i].append(values[i]); + sizes[i] = static_cast(getValues[i].size()); + putIndexByPending[i] = static_cast(putKeys.size()); + putKeys.push_back(getKeys[i]); + putValues.push_back(std::move(getValues[i])); + activeCount++; + } + + std::vector putOk; + if (activeCount > 0) { + auto _putWaitBegin = std::chrono::high_resolution_clock::now(); + MultiPutWithStatus(putKeys, putValues, &putOk, MaxTimeout, reqs); + RecordAsyncWait(TiKVIO::AsyncWaitKind::AddIndexSingleKeyPut, + activeCount, + static_cast(std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - _putWaitBegin).count())); + } + + // ---- Pass 4: process results + post-merge SplitAsync ---- + int retryCount = 0; + for (size_t i = 0; i < keys.size(); i++) { + int putIndex = putIndexByPending[i]; + bool needRetry = putIndex < 0 || + static_cast(putIndex) >= putOk.size() || + putOk[putIndex] == 0; + if 
(needRetry) { + retryCount++; + auto ret = Merge(keys[i], values[i], MaxTimeout, reqs, sizes[i]); + if (ret != ErrorCode::Success && firstErr == ErrorCode::Success) { + firstErr = ret; + } + } + if (m_postingCountCache) { + m_postingCountCache->Put(keys[i], sizes[i]); + } + } + if (retryCount > 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "AddIndexAsyncSingleKey: %zu heads, %d sync-retries\n", + keys.size(), retryCount); + } + } + return firstErr; + } + // ---- MultiPut / MultiDelete operations ---- // Group keys by (leader address, region id) and issue one RawBatchPut / // RawBatchDelete per region. Region groups run in parallel via std::async, @@ -1078,7 +1350,7 @@ namespace SPTAG::SPANN ErrorCode MultiPut(const std::vector& keys, const std::vector& values, const std::chrono::microseconds& timeout, - std::vector* reqs) override + std::vector* reqs) { if (keys.empty()) return ErrorCode::Success; if (keys.size() != values.size()) { @@ -1095,7 +1367,7 @@ namespace SPTAG::SPANN } ErrorCode MultiDelete(const std::vector& keys, - const std::chrono::microseconds& timeout) override + const std::chrono::microseconds& timeout) { if (keys.empty()) return ErrorCode::Success; std::vector prefixedKeys(keys.size()); @@ -1987,8 +2259,10 @@ namespace SPTAG::SPANN // Same as PutChunkAndCount but writes the BASE chunk (no timestamp suffix). // Used by PutPostingToDB compaction path: replaces (overwrites) the base - // chunk and updates the count in a single RawBatchPut RPC. Saves one - // round trip vs separate PutBaseChunk + SetPostingCount. + // chunk and updates the count in a single RawBatchPut RPC. Do not fall + // back to separate writes here: count is required metadata for + // multi-chunk postings, so partial base/count updates must surface as + // failures instead of silently corrupting future append counts. 
ErrorCode PutBaseChunkAndCount(SizeType headID, const std::string& chunkValue, int newCount, @@ -2000,56 +2274,46 @@ namespace SPTAG::SPANN { auto stub = GetStubForKey(chunkKey); - if (stub) { - kvrpcpb::RawBatchPutRequest request; - SetContext(request.mutable_context(), chunkKey); + if (!stub) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "TiKVIO::PutBaseChunkAndCount missing TiKV stub headID=%d\n", headID); + return ErrorCode::Fail; + } - auto* p1 = request.add_pairs(); - p1->set_key(chunkKey); - p1->set_value(chunkValue); + kvrpcpb::RawBatchPutRequest request; + SetContext(request.mutable_context(), chunkKey); - auto* p2 = request.add_pairs(); - p2->set_key(countKey); - p2->set_value(countValue); + auto* p1 = request.add_pairs(); + p1->set_key(chunkKey); + p1->set_value(chunkValue); - kvrpcpb::RawBatchPutResponse response; - grpc::ClientContext ctx; - SetDeadline(ctx, timeout); + auto* p2 = request.add_pairs(); + p2->set_key(countKey); + p2->set_value(countValue); - auto status = stub->RawBatchPut(&ctx, request, &response); - if (status.ok() && !response.has_region_error() && response.error().empty()) { - return ErrorCode::Success; - } - if (!status.ok()) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "TiKVIO::PutBaseChunkAndCount BatchPut gRPC error headID=%d: %s, falling back\n", - headID, status.error_message().c_str()); - } else if (response.has_region_error()) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "TiKVIO::PutBaseChunkAndCount BatchPut region_error headID=%d, falling back\n", headID); - } else { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "TiKVIO::PutBaseChunkAndCount error: %s\n", response.error().c_str()); - } - InvalidateRegionCache(chunkKey); - InvalidateRegionCache(countKey); - } - } + kvrpcpb::RawBatchPutResponse response; + grpc::ClientContext ctx; + SetDeadline(ctx, timeout); - // Fallback: write chunk and count separately. 
- auto ret1 = RawPutWithRetry(chunkKey, chunkValue, timeout); - if (ret1 != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "TiKVIO::PutBaseChunkAndCount fallback: PutBaseChunk failed headID=%d\n", headID); - return ret1; - } - auto ret2 = RawPutWithRetry(countKey, countValue, timeout); - if (ret2 != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, - "TiKVIO::PutBaseChunkAndCount fallback: PutCount failed headID=%d\n", headID); - return ret2; + auto status = stub->RawBatchPut(&ctx, request, &response); + if (status.ok() && !response.has_region_error() && response.error().empty()) { + return ErrorCode::Success; + } + if (!status.ok()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "TiKVIO::PutBaseChunkAndCount BatchPut gRPC error headID=%d: %s\n", + headID, status.error_message().c_str()); + } else if (response.has_region_error()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "TiKVIO::PutBaseChunkAndCount BatchPut region_error headID=%d\n", headID); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "TiKVIO::PutBaseChunkAndCount error: %s\n", response.error().c_str()); + } + InvalidateRegionCache(chunkKey); + InvalidateRegionCache(countKey); } - return ErrorCode::Success; + return ErrorCode::Fail; } // Multi-posting scan: read multiple postings in parallel. @@ -2767,6 +3031,11 @@ namespace SPTAG::SPANN std::vector> m_scanResults; size_t m_scanIndex = 0; + // Posting count cache for multi-chunk mode. + // Tracks approximate vector count per posting to decide when to split. 
+ bool m_useMultiChunkPosting = false; + std::unique_ptr m_postingCountCache; + // ---- Helper: build a prefixed key ---- std::string MakePrefixedKey(const std::string& key) const { std::string result; diff --git a/AnnService/inc/Core/SPANN/Options.h b/AnnService/inc/Core/SPANN/Options.h index 5c9c61a6f..2c9c8865e 100644 --- a/AnnService/inc/Core/SPANN/Options.h +++ b/AnnService/inc/Core/SPANN/Options.h @@ -34,6 +34,7 @@ namespace SPTAG { std::string m_truthPath; TruthFileType m_truthType; bool m_generateTruth; + std::string m_globalIDPath; std::string m_indexDirectory; std::string m_headIDFile; std::string m_headVectorFile; diff --git a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h index b96895e82..50823168d 100644 --- a/AnnService/inc/Core/SPANN/ParameterDefinitionList.h +++ b/AnnService/inc/Core/SPANN/ParameterDefinitionList.h @@ -23,6 +23,7 @@ DefineBasicParameter(m_warmupDelimiter, std::string, std::string("|"), "WarmupDe DefineBasicParameter(m_truthPath, std::string, std::string(""), "TruthPath") DefineBasicParameter(m_truthType, SPTAG::TruthFileType, SPTAG::TruthFileType::Undefined, "TruthType") DefineBasicParameter(m_generateTruth, bool, false, "GenerateTruth") // Mutable +DefineBasicParameter(m_globalIDPath, std::string, std::string(""), "GlobalIDPath") DefineBasicParameter(m_indexDirectory, std::string, std::string("SPANN"), "IndexDirectory") DefineBasicParameter(m_headIDFile, std::string, std::string("SPTAGHeadVectorIDs.bin"), "HeadVectorIDs") DefineBasicParameter(m_deleteIDFile, std::string, std::string("DeletedIDs.bin"), "DeletedIDs") diff --git a/AnnService/inc/Helper/KeyValueIO.h b/AnnService/inc/Helper/KeyValueIO.h index fda983b3d..a7c3c25b8 100644 --- a/AnnService/inc/Helper/KeyValueIO.h +++ b/AnnService/inc/Helper/KeyValueIO.h @@ -34,24 +34,25 @@ namespace SPTAG virtual ErrorCode Put(const SizeType key, const std::string& value, const std::chrono::microseconds& timeout, std::vector* reqs) 
= 0; - // Batched writes/deletes. Default implementations return Undefined so that - // backends without native batching (RocksDB, FileIO) can ignore them. - // TiKVIO overrides these to issue a single batched RPC per region group, - // which dramatically reduces the number of synchronous gRPC round-trips - // when callers (e.g. SPANN AddIndex Phase 2 / PutPostingToDB) want to - // commit several keys at once. - virtual ErrorCode MultiPut(const std::vector& keys, - const std::vector& values, - const std::chrono::microseconds& timeout, - std::vector* reqs) { return ErrorCode::Undefined; } - - virtual ErrorCode MultiDelete(const std::vector& keys, - const std::chrono::microseconds& timeout) { return ErrorCode::Undefined; } - virtual ErrorCode Merge(const SizeType key, const std::string &value, const std::chrono::microseconds &timeout, std::vector *reqs, int& size) = 0; + virtual ErrorCode MultiMerge(const std::vector& keys, const std::vector& values, + const std::chrono::microseconds& timeout, std::vector* reqs, std::vector& sizes) { + if (keys.size() != values.size()) { + return ErrorCode::Undefined; + } + sizes.resize(keys.size()); + for (size_t i = 0; i < keys.size(); i++) { + auto err = Merge(keys[i], values[i], timeout, reqs, sizes[i]); + if (err != ErrorCode::Success) { + return err; + } + } + return ErrorCode::Success; + } + virtual ErrorCode Delete(SizeType key) = 0; virtual ErrorCode DeleteRange(SizeType start, SizeType end) {return ErrorCode::Undefined;} @@ -79,6 +80,8 @@ namespace SPTAG virtual ErrorCode StartToScan(SizeType& key, std::string* value) {return ErrorCode::Undefined;} virtual ErrorCode NextToScan(SizeType& key, std::string* value) {return ErrorCode::Undefined;} + + virtual void LogAsyncWaitStatsAndReset(int layer) {} }; } } diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h index 6aee44b30..01c82e2a7 100644 --- a/AnnService/inc/Helper/ThreadPool.h +++ b/AnnService/inc/Helper/ThreadPool.h @@ -5,7 +5,7 @@ 
#define _SPTAG_HELPER_THREADPOOL_H_ #include -#include +#include #include #include #include @@ -78,7 +78,16 @@ namespace SPTAG { { std::lock_guard lock(m_lock); - m_jobs.push(j); + m_jobs.push_back(j); + } + m_cond.notify_one(); + } + + void addfront(Job* j) + { + { + std::lock_guard lock(m_lock); + m_jobs.push_front(j); } m_cond.notify_one(); } @@ -90,7 +99,7 @@ namespace SPTAG if (!m_abort.ShouldAbort()) { j = m_jobs.front(); currentJobs++; - m_jobs.pop(); + m_jobs.pop_front(); return true; } return false; @@ -113,7 +122,7 @@ namespace SPTAG protected: std::atomic_uint32_t currentJobs{ 0 }; - std::queue m_jobs; + std::deque m_jobs; Abort m_abort; std::mutex m_lock; std::condition_variable m_cond; diff --git a/AnnService/src/BalancedDataPartition/main.cpp b/AnnService/src/BalancedDataPartition/main.cpp index 886b10790..7bddc0ac8 100644 --- a/AnnService/src/BalancedDataPartition/main.cpp +++ b/AnnService/src/BalancedDataPartition/main.cpp @@ -21,6 +21,11 @@ using namespace SPTAG; } typedef short LabelType; +#ifndef LARGEVID +#define MPIVIDTYPE MPI_INT +#else +#define MPIVIDTYPE MPI_LONG_LONG +#endif class PartitionOptions : public Helper::ReaderOptions { @@ -36,6 +41,7 @@ class PartitionOptions : public Helper::ReaderOptions AddOptionalOption(m_distMethod, "-m", "--dist", "Distance method (L2 or Cosine)."); AddOptionalOption(m_outdir, "-o", "--outdir", "Output directory."); AddOptionalOption(m_weightfile, "-w", "--weight", "vector weight file."); + AddOptionalOption(m_gidfile, "-gid", "--gid", "global id file."); AddOptionalOption(m_wlambda, "-lw", "--wlambda", "lambda for balanced weight level."); AddOptionalOption(m_seed, "-e", "--seed", "Random seed."); AddOptionalOption(m_initIter, "-x", "--init", "Number of iterations for initialization."); @@ -82,6 +88,7 @@ class PartitionOptions : public Helper::ReaderOptions std::string m_outfile = "vectors.bin"; std::string m_outmetafile = "meta.bin"; std::string m_outmetaindexfile = "metaindex.bin"; + std::string 
m_gidfile = "-"; std::string m_weightfile = "-"; std::string m_stage = "Clustering"; std::string m_status = "."; @@ -495,6 +502,19 @@ template void Process(MPI_Datatype type) win.read((char *)weights.data(), sizeof(float) * rows); win.close(); } + + std::shared_ptr> globalids = nullptr; + if (options.m_gidfile.compare("-") != 0) + { + options.m_gidfile = Helper::StrUtils::ReplaceAll(options.m_gidfile, "*", std::to_string(rank)); + globalids = std::make_shared>(); + if (ErrorCode::Success != globalids->Load(options.m_gidfile, 1024 * 1024, vectors->Count() + 1)) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Rank %d failed to read global ID file %s.\n", rank, + options.m_gidfile.c_str()); + exit(1); + } + } COMMON::Dataset data(vectors->Count(), vectors->Dimension(), 1024 * 1024, vectors->Count() + 1, (T *)vectors->GetData()); COMMON::KmeansArgs args(options.m_clusterNum, vectors->Dimension(), vectors->Count(), options.m_threadNum, @@ -664,7 +684,8 @@ template void Process(MPI_Datatype type) std::string metafile = options.m_outdir + "/" + options.m_outmetafile + "." + std::to_string(i); std::string metaindexfile = options.m_outdir + "/" + options.m_outmetaindexfile + "." + std::to_string(i); - std::shared_ptr out = f_createIO(), metaout = f_createIO(), metaindexout = f_createIO(); + std::string gidfile = options.m_outdir + "/" + options.m_gidfile + "." 
+ std::to_string(i); + std::shared_ptr out = f_createIO(), metaout = f_createIO(), metaindexout = f_createIO(), gidout = f_createIO(); if (out == nullptr || !out->Initialize(vecfile.c_str(), std::ios::binary | std::ios::out)) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot open %s to write.\n", vecfile.c_str()); @@ -681,12 +702,18 @@ template void Process(MPI_Datatype type) SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot open %s to write.\n", metaindexfile.c_str()); exit(1); } - - CHECKIO(out, WriteBinary, sizeof(int), (char *)(&args.counts[i])); - CHECKIO(out, WriteBinary, sizeof(int), (char *)(&args._D)); + if (globalids != nullptr && (gidout == nullptr || !gidout->Initialize(gidfile.c_str(), std::ios::binary | std::ios::out))) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot open %s to write.\n", gidfile.c_str()); + exit(1); + } + CHECKIO(out, WriteBinary, sizeof(SizeType), (char *)(&args.counts[i])); + CHECKIO(out, WriteBinary, sizeof(DimensionType), (char *)(&args._D)); if (metas != nullptr) - CHECKIO(metaindexout, WriteBinary, sizeof(int), (char *)(&args.counts[i])); - + CHECKIO(metaindexout, WriteBinary, sizeof(SizeType), (char *)(&args.counts[i])); + if (globalids != nullptr) { + CHECKIO(gidout, WriteBinary, sizeof(SizeType), (char *)(&args.counts[i])); + } std::uint64_t offset = 0; T *recvbuf = args.newTCenters; int recvmetabuflen = 200; @@ -696,9 +723,9 @@ template void Process(MPI_Datatype type) uint64_t offset_before = offset; if (j != rank) { - int recv = 0; - MPI_Recv(&recv, 1, MPI_INT, j, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - for (int k = 0; k < recv; k++) + SizeType recv = 0; + MPI_Recv(&recv, 1, MPIVIDTYPE, j, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + for (SizeType k = 0; k < recv; k++) { MPI_Recv(recvbuf, args._D, type, j, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); CHECKIO(out, WriteBinary, sizeof(T) * args._D, (char *)recvbuf); @@ -719,6 +746,12 @@ template void Process(MPI_Datatype type) CHECKIO(metaindexout, WriteBinary, 
sizeof(std::uint64_t), (char *)(&offset)); offset += len; } + if (globalids != nullptr) + { + SizeType gid; + MPI_Recv(&gid, 1, MPIVIDTYPE, j, 4, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + CHECKIO(gidout, WriteBinary, sizeof(SizeType), (char *)(&gid)); + } } SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "rank %d <- rank %d: %d vectors, %llu bytes meta\n", rank, j, recv, (offset - offset_before)); @@ -726,9 +759,9 @@ template void Process(MPI_Datatype type) else { size_t total_rec = 0; - for (int k = 0; k < data.R(); k++) + for (SizeType k = 0; k < data.R(); k++) { - for (int kk = 0; kk < label.C(); kk++) + for (DimensionType kk = 0; kk < label.C(); kk++) { if (label[k][kk] == (LabelType)i) { @@ -740,6 +773,10 @@ template void Process(MPI_Datatype type) CHECKIO(metaindexout, WriteBinary, sizeof(std::uint64_t), (char *)(&offset)); offset += meta.Length(); } + if (globalids != nullptr) { + SizeType gid = *((*globalids)[localindices[k]]); + CHECKIO(gidout, WriteBinary, sizeof(SizeType), (char *)(&gid)); + } total_rec++; } } @@ -754,16 +791,17 @@ template void Process(MPI_Datatype type) out->ShutDown(); metaout->ShutDown(); metaindexout->ShutDown(); + if (globalids != nullptr) gidout->ShutDown(); } else { int dest = i % size; - MPI_Send(&args.newCounts[i], 1, MPI_INT, dest, 0, MPI_COMM_WORLD); + MPI_Send(&args.newCounts[i], 1, MPIVIDTYPE, dest, 0, MPI_COMM_WORLD); size_t total_len = 0; size_t total_rec = 0; - for (int j = 0; j < data.R(); j++) + for (SizeType j = 0; j < data.R(); j++) { - for (int kk = 0; kk < label.C(); kk++) + for (DimensionType kk = 0; kk < label.C(); kk++) { if (label[j][kk] == (LabelType)i) { @@ -776,6 +814,10 @@ template void Process(MPI_Datatype type) MPI_Send(meta.Data(), len, MPI_CHAR, dest, 3, MPI_COMM_WORLD); total_len += len; } + if (globalids != nullptr) { + SizeType gid = *((*globalids)[localindices[j]]); + MPI_Send(&gid, 1, MPIVIDTYPE, dest, 4, MPI_COMM_WORLD); + } total_rec++; } } @@ -825,12 +867,12 @@ ErrorCode SyncSaveCenter(COMMON::KmeansArgs 
&args, int rank, int iteration, u CHECKIO(out, WriteBinary, sizeof(float) * args._K * args._D, (const char *)args.newCenters); if (assign) { - CHECKIO(out, WriteBinary, sizeof(int) * args._K, (const char *)args.counts); + CHECKIO(out, WriteBinary, sizeof(SizeType) * args._K, (const char *)args.counts); CHECKIO(out, WriteBinary, sizeof(float) * args._K, (const char *)args.weightedCounts); } else { - CHECKIO(out, WriteBinary, sizeof(int) * args._K, (const char *)args.newCounts); + CHECKIO(out, WriteBinary, sizeof(SizeType) * args._K, (const char *)args.newCounts); CHECKIO(out, WriteBinary, sizeof(float) * args._K, (const char *)args.newWeightedCounts); } out->ShutDown(); @@ -898,7 +940,7 @@ ErrorCode SyncLoadCenter(COMMON::KmeansArgs &args, int rank, int iteration, u } memset(args.newCenters, 0, sizeof(float) * args._K * args._D); - memset(args.counts, 0, sizeof(int) * args._K); + memset(args.counts, 0, sizeof(SizeType) * args._K); memset(args.weightedCounts, 0, sizeof(float) * args._K); std::unique_ptr buf(new char[sizeof(float) * args._K * args._D]); unsigned long long localCount; @@ -926,10 +968,10 @@ ErrorCode SyncLoadCenter(COMMON::KmeansArgs &args, int rank, int iteration, u for (int i = 0; i < args._K * args._D; i++) args.newCenters[i] += *((float *)(buf.get()) + i); - CHECKIO(input, ReadBinary, sizeof(int) * args._K, buf.get()); + CHECKIO(input, ReadBinary, sizeof(SizeType) * args._K, buf.get()); for (int i = 0; i < args._K; i++) { - int partsize = *((int *)(buf.get()) + i); + SizeType partsize = *((SizeType *)(buf.get()) + i); if (partsize >= 0 && args.counts[i] <= MaxSize - partsize) args.counts[i] += partsize; else @@ -1181,7 +1223,16 @@ template void Partition() SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Failed to read labels.\n"); exit(1); } - + std::shared_ptr> globalids = nullptr; + if (options.m_gidfile.compare("-") != 0) + { + globalids = std::make_shared>(); + if (ErrorCode::Success != globalids->Load(options.m_gidfile, 1024 * 1024, vectors->Count() 
+ 1)) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Failed to read global ID file %s.\n", options.m_gidfile.c_str()); + exit(1); + } + } std::string taskId = options.m_labels.substr(options.m_labels.rfind(".") + 1); for (int i = 0; i < options.m_clusterNum; i++) { @@ -1189,7 +1240,8 @@ template void Partition() std::string metafile = options.m_outdir + "/" + options.m_outmetafile + "." + taskId + "." + std::to_string(i); std::string metaindexfile = options.m_outdir + "/" + options.m_outmetaindexfile + "." + taskId + "." + std::to_string(i); - std::shared_ptr out = f_createIO(), metaout = f_createIO(), metaindexout = f_createIO(); + std::string gidfile = options.m_outdir + "/" + options.m_gidfile + "." + taskId + "." + std::to_string(i); + std::shared_ptr out = f_createIO(), metaout = f_createIO(), metaindexout = f_createIO(), gidout = f_createIO(); if (out == nullptr || !out->Initialize(vecfile.c_str(), std::ios::binary | std::ios::out)) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot open %s to write.\n", vecfile.c_str()); @@ -1206,18 +1258,25 @@ template void Partition() SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot open %s to write.\n", metaindexfile.c_str()); exit(1); } - - int rows = data.R(), cols = data.C(); - CHECKIO(out, WriteBinary, sizeof(int), (char *)(&rows)); - CHECKIO(out, WriteBinary, sizeof(int), (char *)(&cols)); + if (globalids != nullptr && (gidout == nullptr || !gidout->Initialize(gidfile.c_str(), std::ios::binary | std::ios::out))) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Cannot open %s to write.\n", gidfile.c_str()); + exit(1); + } + SizeType rows = data.R(); + DimensionType cols = data.C(); + CHECKIO(out, WriteBinary, sizeof(SizeType), (char *)(&rows)); + CHECKIO(out, WriteBinary, sizeof(DimensionType), (char *)(&cols)); if (metas != nullptr) - CHECKIO(metaindexout, WriteBinary, sizeof(int), (char *)(&rows)); - + CHECKIO(metaindexout, WriteBinary, sizeof(SizeType), (char *)(&rows)); + if (globalids != nullptr) { + 
CHECKIO(gidout, WriteBinary, sizeof(SizeType), (char *)(&rows)); + } std::uint64_t offset = 0; - int records = 0; - for (int k = 0; k < data.R(); k++) + SizeType records = 0; + for (SizeType k = 0; k < data.R(); k++) { - for (int kk = 0; kk < label.C(); kk++) + for (DimensionType kk = 0; kk < label.C(); kk++) { if (label[k][kk] == (LabelType)i) { @@ -1229,6 +1288,10 @@ template void Partition() CHECKIO(metaindexout, WriteBinary, sizeof(std::uint64_t), (char *)(&offset)); offset += meta.Length(); } + if (globalids != nullptr) { + SizeType gid = *((*globalids)[k]); + CHECKIO(gidout, WriteBinary, sizeof(SizeType), (char *)(&gid)); + } records++; } } @@ -1238,12 +1301,15 @@ template void Partition() if (metas != nullptr) CHECKIO(metaindexout, WriteBinary, sizeof(std::uint64_t), (char *)(&offset)); - CHECKIO(out, WriteBinary, sizeof(int), (char *)(&records), 0); - CHECKIO(metaindexout, WriteBinary, sizeof(int), (char *)(&records), 0); - + CHECKIO(out, WriteBinary, sizeof(SizeType), (char *)(&records), 0); + CHECKIO(metaindexout, WriteBinary, sizeof(SizeType), (char *)(&records), 0); + if (globalids != nullptr) { + CHECKIO(gidout, WriteBinary, sizeof(SizeType), (char *)(&records), 0); + } out->ShutDown(); metaout->ShutDown(); metaindexout->ShutDown(); + if (globalids != nullptr) gidout->ShutDown(); } } diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp index b5db83822..24c839455 100644 --- a/AnnService/src/Core/SPANN/ExtraFileController.cpp +++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp @@ -25,7 +25,7 @@ bool FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer) #ifndef _MSC_VER O_RDWR | O_DIRECT, numblocks, 2, 2, max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) + - p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)), + (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + 
p_opt.m_appendThreadNum))), ((std::uint64_t)p_opt.m_startFileSize) << 30 #else GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2, diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index ccda4476a..f3f83dca6 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -1100,22 +1100,24 @@ template ErrorCode Index::BuildIndexInternalLayer(std::shared_pt int currentLayer = static_cast(m_extraSearchers.size()); COMMON::Dataset localToGlobalID; { - if (currentLayer > 0) { - std::shared_ptr ptr = SPTAG::f_createIO(); - if (ptr == nullptr || - !ptr->Initialize((m_options.m_indexDirectory + FolderSep + m_options.m_headIDFile).c_str(), - std::ios::binary | std::ios::in)) + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Loading headIDFile for layer %d...\n", currentLayer - 1); + std::string localToGlobalIDPath = (currentLayer == 0)? m_options.m_globalIDPath : m_options.m_indexDirectory + FolderSep + m_options.m_headIDFile; + std::shared_ptr ptr = SPTAG::f_createIO(); + if (ptr == nullptr || + !ptr->Initialize(localToGlobalIDPath.c_str(), + std::ios::binary | std::ios::in)) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "No headIDFile file:%s\n", + localToGlobalIDPath.c_str()); + } + else { + localToGlobalID.Load(ptr, this->m_iDataBlockSize, this->m_iDataCapacity); + SizeType vectorCount = p_reader->GetVectorSet()->Count(); + if (localToGlobalID.R() != vectorCount) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "No headIDFile file:%s\n", - (m_options.m_indexDirectory + FolderSep + m_options.m_headIDFile).c_str()); - } - else { - localToGlobalID.Load(ptr, m_topIndex->m_iDataBlockSize, m_topIndex->m_iDataCapacity); + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "HeadIDFile count %lld doesn't match head vector file count %lld!\n", (int64_t)localToGlobalID.R(), (int64_t)vectorCount); + localToGlobalID.SetR(0); } - } else { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "Layer 0 build: skip loading 
localToGlobalID from %s\n", - (m_options.m_indexDirectory + FolderSep + m_options.m_headIDFile).c_str()); } } @@ -1150,6 +1152,7 @@ template ErrorCode Index::BuildIndexInternalLayer(std::shared_pt m_topIndex = SPTAG::VectorIndex::CreateInstance(m_options.m_indexAlgoType, valueType); m_topIndex->SetParameter("DistCalcMethod", SPTAG::Helper::Convert::ConvertToString(m_options.m_distCalcMethod)); + m_topIndex->SetParameter("ParallelBKTBuild", m_options.m_parallelBKTBuild ? "true" : "false"); m_topIndex->SetQuantizer(m_pQuantizer); for (const auto &iter : m_topParameters) { @@ -2008,9 +2011,9 @@ template void Index::PrepareDB(std::shared_ptr BuildIndex(const std::string &outDirectory, std::sh template std::shared_ptr BuildLargeIndex(const std::string &outDirectory, std::string &pvecset, - std::string& pmetaset, std::string& pmetaidx, const std::string &distMethod = "L2", + std::string& pmetaset, std::string& pmetaidx, Helper::IniReader& iniReader, const std::string &distMethod = "L2", int searchthread = 2, int insertthread = 2, int layers = 1, - std::shared_ptr quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin", - const std::map& ssdOverrides = {}, - bool ssdOnly = false) + std::shared_ptr quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin") { auto vecIndex = VectorIndex::CreateInstance(IndexAlgoType::SPANN, GetEnumValueType()); int maxthreads = std::thread::hardware_concurrency(); @@ -335,6 +333,7 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st SplitFactor=0 SplitThreshold=0 Ratio=0.2 + ParallelBKTBuild=true [BuildHead] isExecute=true @@ -379,6 +378,7 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st DeletePercentageForRefine=0.4 AsyncAppendQueueSize=0 AllowZeroReplica=false + ShareDB=true Layers=)" + std::to_string(layers) + R"( )"; @@ -399,29 +399,15 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st } } - // Apply overrides (e.g., Storage, TiKV settings, 
SelectHead/BuildHead params) - for (const auto &[key, val] : ssdOverrides) + for (const auto &sec : sections) { - // Keys prefixed with "SectionName." are routed to the corresponding section - auto dotPos = key.find('.'); - if (dotPos != std::string::npos) { - std::string section = key.substr(0, dotPos); - std::string param = key.substr(dotPos + 1); - vecIndex->SetParameter(param.c_str(), val.c_str(), section.c_str()); - } else { - vecIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex"); + auto params = iniReader.GetParameters(sec.c_str()); + for (const auto &[key, val] : params) + { + vecIndex->SetParameter(key.c_str(), val.c_str(), sec.c_str()); } } - // SSD-only mode: skip SelectHead and BuildHead, resume from specified layer - if (ssdOnly) - { - // Allow explicit ResumeLayer from config/overrides; otherwise default to layer 0 - // (rebuild SSD for all layers, reusing existing head indexes) - int resumeLayer = 0; - vecIndex->SetParameter("ResumeLayer", std::to_string(resumeLayer).c_str(), "BuildSSDIndex"); - } - if (quantizer) { vecIndex->SetParameter("QuantizerFilePath", quantizerFilePath.c_str(), "Base"); @@ -850,11 +836,9 @@ ErrorCode QuantizeVectors(const std::shared_ptr& quantizer, template void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, const std::string &truthPath, DistCalcMethod distMethod, const std::string &indexPath, int dimension, int baseVectorCount, - int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numQueries, + int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries, Helper::IniReader& iniReader, const std::string &outputFile = "output.json", const bool rebuild = true, const int resume = -1, - const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1, - const std::map& ssdOverrides = {}, - bool 
rebuildSsdOnly = false) + const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1) { int oldM = M, oldK = K, oldN = N, oldQueries = queries; N = baseVectorCount; @@ -923,18 +907,7 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c // Build initial index BOOST_TEST_MESSAGE("\n=== Building Index ==="); - if (rebuild || rebuildSsdOnly || !direxists(indexPath.c_str())) { - if (!rebuildSsdOnly) { - // Allow empty or non-existent directories; block only if index files already exist - if (direxists(indexPath.c_str()) && fileexists((indexPath + FolderSep + "indexloader.ini").c_str())) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, - "Index directory '%s' already exists with index files. Refusing to delete. " - "Remove it manually or use RebuildSSDOnly=true to resume.\n", - indexPath.c_str()); - BOOST_FAIL("Index directory already exists: " + indexPath); - return; - } - } + if (rebuild || !direxists(indexPath.c_str())) { auto buildstart = std::chrono::high_resolution_clock::now(); if (enableQuantization) @@ -959,13 +932,13 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c quantizedBase->Save(pquanvecset); } - index = BuildLargeIndex(indexPath, pquanvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin", ssdOverrides, rebuildSsdOnly); + index = BuildLargeIndex(indexPath, pquanvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin"); BOOST_REQUIRE(index != nullptr); index->SetQuantizerADC(true); } else { - index = BuildLargeIndex(indexPath, pvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, nullptr, "quantizer.bin", ssdOverrides, rebuildSsdOnly); + index = BuildLargeIndex(indexPath, pvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers); BOOST_REQUIRE(index != nullptr); } @@ -1120,7 +1093,7 @@ void RunBenchmark(const 
std::string &vectorPath, const std::string &queryPath, c std::shared_ptr addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, insertBatchSize); start = std::chrono::high_resolution_clock::now(); InsertVectors(static_cast *>(cloneIndex.get()), numInsertThreads, insertBatchSize, - addset, addmetaset, numSearchThreads, queryset, numQueries, SearchK, &jsonFile, 0); + addset, addmetaset, numSearchDuringInsertThreads, queryset, numQueries, SearchK, &jsonFile, 0); end = std::chrono::high_resolution_clock::now(); } seconds = @@ -2273,47 +2246,13 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) int numSearchThreads = iniReader.GetParameter("Benchmark", "NumSearchThreads", 8); int numInsertThreads = iniReader.GetParameter("Benchmark", "NumInsertThreads", 8); int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0); + int numSearchDuringInsertThreads = iniReader.GetParameter("Benchmark", "NumSearchDuringInsertThreads", 1); int numQueries = iniReader.GetParameter("Benchmark", "NumQueries", 1000); int layers = iniReader.GetParameter("Benchmark", "Layers", 1); DistCalcMethod distMethod = iniReader.GetParameter("Benchmark", "DistMethod", DistCalcMethod::L2); - bool rebuild = iniReader.GetParameter("Benchmark", "Rebuild", true); - bool rebuildSsdOnly = iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false); + bool rebuild = (iniReader.GetParameter("Benchmark", "Rebuild", true) || iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false)); int resume = iniReader.GetParameter("Benchmark", "Resume", -1); - // Read storage backend overrides for BuildSSDIndex - std::map ssdOverrides; - std::string storage = iniReader.GetParameter("Benchmark", "Storage", std::string("")); - if (!storage.empty()) { - ssdOverrides["Storage"] = storage; - } - std::string tikvPDAddresses = iniReader.GetParameter("Benchmark", "TiKVPDAddresses", std::string("")); - if (!tikvPDAddresses.empty()) { - ssdOverrides["TiKVPDAddresses"] = 
tikvPDAddresses; - } - std::string tikvKeyPrefix = iniReader.GetParameter("Benchmark", "TiKVKeyPrefix", std::string("")); - if (!tikvKeyPrefix.empty()) { - ssdOverrides["TiKVKeyPrefix"] = tikvKeyPrefix; - } - if (appendThreadNum > 0) { - ssdOverrides["AppendThreadNum"] = std::to_string(appendThreadNum); - } - - // Pass through any [BuildSSDIndex] section params from the ini as overrides - auto buildSSDParams = iniReader.GetParameters("BuildSSDIndex"); - for (const auto &[key, val] : buildSSDParams) { - ssdOverrides[key] = val; - } - - // Pass through [SelectHead] and [BuildHead] params as overrides too - auto selectHeadParams = iniReader.GetParameters("SelectHead"); - for (const auto &[key, val] : selectHeadParams) { - ssdOverrides["SelectHead." + key] = val; - } - auto buildHeadParams = iniReader.GetParameters("BuildHead"); - for (const auto &[key, val] : buildHeadParams) { - ssdOverrides["BuildHead." + key] = val; - } - BOOST_TEST_MESSAGE("=== Benchmark Configuration ==="); BOOST_TEST_MESSAGE("Vector Path: " << vectorPath); BOOST_TEST_MESSAGE("Query Path: " << queryPath); @@ -2324,6 +2263,7 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) BOOST_TEST_MESSAGE("Top-K: " << topK); BOOST_TEST_MESSAGE("SearchThreads: " << numSearchThreads); BOOST_TEST_MESSAGE("InsertThreads: " << numInsertThreads); + BOOST_TEST_MESSAGE("SearchDuringInsertThreads: " << numSearchDuringInsertThreads); BOOST_TEST_MESSAGE("Queries: " << numQueries); BOOST_TEST_MESSAGE("Layers: " << layers); BOOST_TEST_MESSAGE("DistMethod: " << Helper::Convert::ConvertToString(distMethod)); @@ -2342,20 +2282,20 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) if (valueType == VectorValueType::Float) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, outputFile, - rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly); + 
insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, + outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); } else if (valueType == VectorValueType::Int8) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, + outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); } else if (valueType == VectorValueType::UInt8) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, + outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); } //std::filesystem::remove_all(indexPath); diff --git a/Test/src/VersionMapTest.cpp b/Test/src/VersionMapTest.cpp index ed50a769e..ecf5c7117 100644 --- a/Test/src/VersionMapTest.cpp +++ b/Test/src/VersionMapTest.cpp @@ -44,7 +44,7 @@ static std::unique_ptr MakeTiKVVersionMap(const std::string& tes auto now = std::chrono::steady_clock::now().time_since_epoch().count(); std::string prefix = "vmtest_" + testName + "_" + std::to_string(now) + "_"; - auto db = std::make_shared(std::string(pdAddr), prefix); + auto db = std::make_shared(std::string(pdAddr), prefix, false, 100000); auto vm = std::make_unique(); vm->SetDB(db); 
vm->SetLayer(0); diff --git a/Test/src/main.cpp b/Test/src/main.cpp index ab8d1342c..c1a5cde60 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -7,7 +7,9 @@ #include #include +#ifdef TIKV #include +#endif using namespace boost::unit_test; @@ -36,8 +38,9 @@ struct GlobalFixture // adds GraphCycles bookkeeping under a global spinlock on every Lock(); // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). - absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); - +#ifdef TIKV + absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); +#endif SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); } diff --git a/benchmark.ini b/benchmark.ini deleted file mode 100644 index e2b400767..000000000 --- a/benchmark.ini +++ /dev/null @@ -1,19 +0,0 @@ -[Benchmark] -VectorPath=sift1b/base.100M.u8bin -QueryPath=sift1b/query.public.10K.u8bin -TruthPath=none -IndexPath=proidx/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=10000 -InsertVectorCount=10000 -DeleteVectorCount=0 -BatchNum=10 -TopK=5 -NumThreads=8 -NumQueries=100 -DistMethod=L2 -Rebuild=true -Resume=-1 -QuantizerFilePath=quantizer.bin -QuantizedDim=64 diff --git a/evaluation/2026-05-08-merged_spfresh/benchmark.ini b/evaluation/2026-05-08-merged_spfresh/benchmark.ini new file mode 100644 index 000000000..43db313d4 --- /dev/null +++ b/evaluation/2026-05-08-merged_spfresh/benchmark.ini @@ -0,0 +1,36 @@ +[Benchmark] +VectorPath=sift1b/base.1B.u8bin +QueryPath=sift1b/query.public.10K.u8bin +TruthPath=truth_1m_l2_batchget +IndexPath=/mnt/nvme/qi/index_1m_final/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=10000000 +DeleteVectorCount=0 +BatchNum=10 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=16 +NumSearchDuringInsertThreads=1 +NumQueries=1000 +DistMethod=L2 +Rebuild=true +Resume=-1 +Layers=2 + +[SelectHead] +ParallelBKTBuild=true + 
+[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 +Storage=TIKVIO +TiKVPDAddresses=127.0.0.1:23791,127.0.0.1:23792,127.0.0.1:23793 +TiKVKeyPrefix=qi_1m_l2 +AppendThreadNum=48