From de883ea9fa4592b4df387e781a8f8ee92ae0f4cb Mon Sep 17 00:00:00 2001 From: Lin Date: Tue, 16 Mar 2021 15:59:48 +0800 Subject: [PATCH 1/2] see changelog0316 --- .vscode/settings.json | 5 ++ CMakeLists.txt | 1 + hnswlib/bruteforce.h | 2 +- hnswlib/hnswalg.h | 92 +++++++++++----------------- hnswlib/hnswlib.h | 2 +- hnswlib/visited_list_pool.h | 16 ++--- mnist.cpp | 117 ++++++++++++++++++++++++++++++++++++ 7 files changed, 169 insertions(+), 66 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 mnist.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..5e09a03c --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "files.associations": { + "chrono": "cpp" + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 31935e0e..28838428 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,5 +24,6 @@ endif() add_executable(test_updates examples/updates_test.cpp) add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) +add_executable(mnist mnist.cpp) target_link_libraries(main sift_test) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 24260400..33ebaa4a 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -43,7 +43,7 @@ namespace hnswlib { std::unordered_map dict_external_to_internal; - void addPoint(const void *datapoint, labeltype label) { + void addPoint(const void *datapoint, labeltype label, bool bUpdate = false) { int idx; { diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index a2f72dc7..e90f522c 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -73,7 +73,7 @@ namespace hnswlib { struct CompareByFirst { constexpr bool operator()(std::pair const &a, std::pair const &b) const noexcept { - return a.first < b.first; + return a.first > b.first; //let the smaller at the top } }; @@ -159,13 +159,13 @@ namespace hnswlib { } - std::priority_queue, std::vector>, CompareByFirst> + std::priority_queue> searchBaseLayer(tableint ep_id, const void *data_point, int layer) { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; - std::priority_queue, std::vector>, CompareByFirst> top_candidates; + std::priority_queue> top_candidates; std::priority_queue, std::vector>, CompareByFirst> candidateSet; dist_t lowerBound; @@ -173,16 +173,16 @@ namespace hnswlib { dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_); top_candidates.emplace(dist, ep_id); lowerBound = dist; - candidateSet.emplace(-dist, ep_id); + candidateSet.emplace(dist, ep_id); } else { lowerBound = std::numeric_limits::max(); - candidateSet.emplace(-lowerBound, ep_id); + candidateSet.emplace(lowerBound, ep_id); } visited_array[ep_id] = visited_array_tag; while (!candidateSet.empty()) { std::pair curr_el_pair = candidateSet.top(); - if ((-curr_el_pair.first) > lowerBound) { + if (curr_el_pair.first > lowerBound) { break; } candidateSet.pop(); @@ -191,13 +191,7 @@ namespace hnswlib { std::unique_lock lock(link_list_locks_[curNodeNum]); - int *data;// = (int *)(linkList0_ + curNodeNum * size_links_per_element0_); - if (layer == 0) { - data = (int*)get_linklist0(curNodeNum); - } else { - data = (int*)get_linklist(curNodeNum, layer); -// data = (int *) (linkLists_[curNodeNum] + (layer - 1) * size_links_per_element_); - } + int *data = (int*)get_linklist_at_level(curNodeNum, layer); size_t size = getListCount((linklistsizeint*)data); tableint *datal = (tableint *) (data + 1); #ifdef USE_SSE @@ -220,7 +214,7 @@ namespace hnswlib { dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_); if (top_candidates.size() < ef_construction_ || lowerBound > dist1) { - candidateSet.emplace(-dist1, candidate_id); + candidateSet.emplace(dist1, candidate_id); #ifdef USE_SSE _mm_prefetch(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0); #endif @@ -245,13 +239,13 @@ namespace hnswlib { mutable std::atomic metric_hops; template - std::priority_queue, std::vector>, CompareByFirst> + std::priority_queue> searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef) const { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; - std::priority_queue, std::vector>, CompareByFirst> top_candidates; + std::priority_queue> top_candidates; std::priority_queue, std::vector>, CompareByFirst> candidate_set; dist_t lowerBound; @@ -259,10 +253,10 @@ namespace hnswlib { dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_); lowerBound = dist; top_candidates.emplace(dist, ep_id); - candidate_set.emplace(-dist, ep_id); + candidate_set.emplace(dist, ep_id); } else { lowerBound = std::numeric_limits::max(); - candidate_set.emplace(-lowerBound, ep_id); + candidate_set.emplace(lowerBound, ep_id); } visited_array[ep_id] = visited_array_tag; @@ -271,7 +265,7 @@ namespace hnswlib { std::pair current_node_pair = candidate_set.top(); - if ((-current_node_pair.first) > lowerBound) { + if (current_node_pair.first > lowerBound) { break; } candidate_set.pop(); @@ -308,7 +302,7 @@ namespace hnswlib { dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_); if (top_candidates.size() < ef || lowerBound > dist) { - candidate_set.emplace(-dist, candidate_id); + candidate_set.emplace(dist, candidate_id); #ifdef USE_SSE _mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ + offsetLevel0_,/////////// @@ -332,17 +326,15 @@ namespace hnswlib { return top_candidates; } - void getNeighborsByHeuristic2( - std::priority_queue, std::vector>, CompareByFirst> &top_candidates, - const size_t M) { + void getNeighborsByHeuristic2(std::priority_queue> &top_candidates, const size_t M) { if (top_candidates.size() < M) { return; } - std::priority_queue> queue_closest; + std::priority_queue, std::vector>, CompareByFirst> queue_closest; std::vector> return_list; while (top_candidates.size() > 0) { - queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second); + queue_closest.emplace(top_candidates.top().first, top_candidates.top().second); top_candidates.pop(); } @@ -350,7 +342,7 @@ namespace hnswlib { if (return_list.size() >= M) break; std::pair curent_pair = queue_closest.top(); - dist_t dist_to_query = -curent_pair.first; + dist_t dist_to_query = curent_pair.first; queue_closest.pop(); bool good = true; @@ -358,7 +350,7 @@ namespace hnswlib { dist_t curdist = fstdistfunc_(getDataByInternalId(second_pair.second), getDataByInternalId(curent_pair.second), - dist_func_param_);; + dist_func_param_); if (curdist < dist_to_query) { good = false; break; @@ -370,7 +362,7 @@ namespace hnswlib { } for (std::pair curent_pair : return_list) { - top_candidates.emplace(-curent_pair.first, curent_pair.second); + top_candidates.emplace(curent_pair.first, curent_pair.second); } } @@ -379,10 +371,6 @@ namespace hnswlib { return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); }; - linklistsizeint *get_linklist0(tableint internal_id, char *data_level0_memory_) const { - return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); - }; - linklistsizeint *get_linklist(tableint internal_id, int level) const { return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_); }; @@ -392,7 +380,7 @@ namespace hnswlib { }; tableint mutuallyConnectNewElement(const void *data_point, tableint cur_c, - std::priority_queue, std::vector>, CompareByFirst> &top_candidates, + std::priority_queue> &top_candidates, int level, bool isUpdate) { size_t Mcurmax = level ? maxM_ : maxM0_; getNeighborsByHeuristic2(top_candidates, M_); @@ -409,11 +397,7 @@ namespace hnswlib { tableint next_closest_entry_point = selectedNeighbors.back(); { - linklistsizeint *ll_cur; - if (level == 0) - ll_cur = get_linklist0(cur_c); - else - ll_cur = get_linklist(cur_c, level); + linklistsizeint *ll_cur = get_linklist_at_level(cur_c, level); if (*ll_cur && !isUpdate) { throw std::runtime_error("The newly inserted element should have blank link list"); @@ -435,11 +419,7 @@ namespace hnswlib { std::unique_lock lock(link_list_locks_[selectedNeighbors[idx]]); - linklistsizeint *ll_other; - if (level == 0) - ll_other = get_linklist0(selectedNeighbors[idx]); - else - ll_other = get_linklist(selectedNeighbors[idx], level); + linklistsizeint *ll_other = get_linklist_at_level(selectedNeighbors[idx], level); size_t sz_link_list_other = getListCount(ll_other); @@ -472,7 +452,7 @@ namespace hnswlib { dist_t d_max = fstdistfunc_(getDataByInternalId(cur_c), getDataByInternalId(selectedNeighbors[idx]), dist_func_param_); // Heuristic: - std::priority_queue, std::vector>, CompareByFirst> candidates; + std::priority_queue> candidates; candidates.emplace(d_max, cur_c); for (size_t j = 0; j < sz_link_list_other; j++) { @@ -528,8 +508,7 @@ namespace hnswlib { bool changed = true; while (changed) { changed = false; - int *data; - data = (int *) get_linklist(currObj,level); + int *data = (int *) get_linklist(currObj,level); int size = getListCount(data); tableint *datal = (tableint *) (data + 1); for (int i = 0; i < size; i++) { @@ -823,8 +802,8 @@ namespace hnswlib { *((unsigned short int*)(ptr))=*((unsigned short int *)&size); } - void addPoint(const void *data_point, labeltype label) { - addPoint(data_point, label,-1); + void addPoint(const void *data_point, labeltype label, bool bUpdate = false) { + addPoint(data_point, label, -1, bUpdate); } void updatePoint(const void *dataPoint, tableint internalId, float updateNeighborProbability) { @@ -866,7 +845,7 @@ namespace hnswlib { // if (neigh == internalId) // continue; - std::priority_queue, std::vector>, CompareByFirst> candidates; + std::priority_queue> candidates; size_t size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1 size_t elementsToKeep = std::min(ef_construction_, size); for (auto&& cand : sCand) { @@ -889,8 +868,7 @@ namespace hnswlib { { std::unique_lock lock(link_list_locks_[neigh]); - linklistsizeint *ll_cur; - ll_cur = get_linklist_at_level(neigh, layer); + linklistsizeint *ll_cur = get_linklist_at_level(neigh, layer); size_t candSize = candidates.size(); setListCount(ll_cur, candSize); tableint *data = (tableint *) (ll_cur + 1); @@ -941,10 +919,10 @@ namespace hnswlib { throw std::runtime_error("Level of item to be updated cannot be bigger than max level"); for (int level = dataPointLevel; level >= 0; level--) { - std::priority_queue, std::vector>, CompareByFirst> topCandidates = searchBaseLayer( + std::priority_queue> topCandidates = searchBaseLayer( currObj, dataPoint, level); - std::priority_queue, std::vector>, CompareByFirst> filteredTopCandidates; + std::priority_queue> filteredTopCandidates; while (topCandidates.size() > 0) { if (topCandidates.top().second != dataPointInternalId) filteredTopCandidates.push(topCandidates.top()); @@ -977,7 +955,7 @@ namespace hnswlib { return result; }; - tableint addPoint(const void *data_point, labeltype label, int level) { + tableint addPoint(const void *data_point, labeltype label, int level, bool bUpdate = false) { tableint cur_c = 0; { @@ -985,7 +963,7 @@ namespace hnswlib { // if so, updating it *instead* of creating a new element. std::unique_lock templock_curr(cur_element_count_guard_); auto search = label_lookup_.find(label); - if (search != label_lookup_.end()) { + if (bUpdate && search != label_lookup_.end()) { tableint existingInternalId = search->second; templock_curr.unlock(); @@ -1073,7 +1051,7 @@ namespace hnswlib { if (level > maxlevelcopy || level < 0) // possible? throw std::runtime_error("Level error"); - std::priority_queue, std::vector>, CompareByFirst> top_candidates = searchBaseLayer( + std::priority_queue> top_candidates = searchBaseLayer( currObj, data_point, level); if (epDeleted) { top_candidates.emplace(fstdistfunc_(data_point, getDataByInternalId(enterpoint_copy), dist_func_param_), enterpoint_copy); @@ -1134,7 +1112,7 @@ namespace hnswlib { } } - std::priority_queue, std::vector>, CompareByFirst> top_candidates; + std::priority_queue> top_candidates; if (has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 9409c388..fe21478b 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -69,7 +69,7 @@ namespace hnswlib { template class AlgorithmInterface { public: - virtual void addPoint(const void *datapoint, labeltype label)=0; + virtual void addPoint(const void *datapoint, labeltype label, bool bUpdate = false) = 0; virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; // Return k nearest neighbor in the order of closer fist diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h index 6b0f4458..9f86d9ea 100644 --- a/hnswlib/visited_list_pool.h +++ b/hnswlib/visited_list_pool.h @@ -2,27 +2,30 @@ #include #include +#include namespace hnswlib { typedef unsigned short int vl_type; class VisitedList { public: - vl_type curV; + vl_type curV, max_vl_type; vl_type *mass; unsigned int numelements; VisitedList(int numelements1) { - curV = -1; numelements = numelements1; - mass = new vl_type[numelements]; + curV = 0; + max_vl_type = std::numeric_limits::max(); + mass = new vl_type[numelements](); } void reset() { - curV++; - if (curV == 0) { - memset(mass, 0, sizeof(vl_type) * numelements); + if (curV < max_vl_type){ curV++; + } else { + memset(mass, 0, sizeof(vl_type) * numelements); + curV = 1; } }; @@ -75,4 +78,3 @@ namespace hnswlib { }; }; } - diff --git a/mnist.cpp b/mnist.cpp new file mode 100644 index 00000000..594e4619 --- /dev/null +++ b/mnist.cpp @@ -0,0 +1,117 @@ +/* + A simple example, tested under Linux. + first, copy train-labels.idx1-ubyte, train-images.idx3-ubyte, t10k-labels.idx1-ubyte, t10k-images.idx3-ubyte to the same path of the executable file. + then, ./mnist +*/ +#include +#include +#include +#include +#include +#include "hnswlib/hnswlib.h" + +using namespace hnswlib; + +//Returns the current resident set size (physical memory use) measured in Mb. +static size_t getCurrentRSS(){ + FILE *fp = fopen("/proc/self/statm", "r"); + if(fp == NULL) return (size_t)0L; + long rss; + if(fscanf(fp, "%*s%ld", &rss) != 1){ + fclose(fp); + return (size_t)0L; + } + fclose(fp); + return (size_t)rss*sysconf(_SC_PAGESIZE)/1024/1024; +} + +inline bool open_check(const char *name){ + FILE *fp = fopen(name, "r"); + if(fp == NULL) return false; + fclose(fp); + return true; +} + +int main(int argc, char *argv[]){ + if(!open_check("train-labels.idx1-ubyte") || !open_check("train-images.idx3-ubyte") || + !open_check("t10k-labels.idx1-ubyte") || !open_check("t10k-images.idx3-ubyte")){ + printf("open mnist files error.\n"); + return 0; + } + unsigned int efConstruction = 48, M = 16, vecdim = 784; + printf("efConstruction=%u, M=%u, vecdim=%u\n", efConstruction, M, vecdim); + + FILE *fp = fopen("train-labels.idx1-ubyte", "rb"); + unsigned char ubyte[16]; + size_t sz = fread(ubyte, 1, 8, fp); //ubyte[0-3] = 0X0081 + unsigned int trainSize = 16777216*ubyte[4] + 65536*ubyte[5] + 256*ubyte[6] + ubyte[7]; + unsigned char *trainCls = new unsigned char[trainSize]; + sz = fread(trainCls, 1, trainSize, fp); + fclose(fp); + + fp = fopen("train-images.idx3-ubyte", "rb"); + sz = fread(ubyte, 1, 16, fp); + unsigned char *trainSet = new unsigned char[trainSize*vecdim]; + for(unsigned int i=0; i *appr_alg; + std::chrono::steady_clock::time_point time_begin = std::chrono::steady_clock::now(); + if(open_check("mnist.bin")){ + printf("Loading index from mnist.bin:\n"); + appr_alg = new HierarchicalNSW(&l2space, "mnist.bin"); + printf("Actual memory usage: %d Mb\n", getCurrentRSS()); + }else{ + printf("Building index:\n"); + appr_alg = new HierarchicalNSW(&l2space, trainSize, M, efConstruction); + #pragma omp parallel for + for(unsigned int i = 0; i < trainSize; i++){ + appr_alg->addPoint((void*)&trainSet[i*vecdim], (size_t)trainCls[i]); + } + printf("Build time: %fs\n", 0.000001*std::chrono::duration_cast(std::chrono::steady_clock::now() - time_begin).count()); + appr_alg->saveIndex("mnist.bin"); + } + + printf("testing...\n"); + fp = fopen("t10k-labels.idx1-ubyte", "rb"); + sz = fread(ubyte, 1, 8, fp); //ubyte[0-3] = 0X0081 + unsigned int testSize = 16777216*ubyte[4] + 65536*ubyte[5] + 256*ubyte[6] + ubyte[7]; + unsigned char *testCls = new unsigned char[testSize]; + sz = fread(testCls, 1, testSize, fp); + fclose(fp); + + fp = fopen("t10k-images.idx3-ubyte", "rb"); + sz = fread(ubyte, 1, 16, fp); + unsigned char *testSet = new unsigned char[testSize*vecdim]; + for(unsigned int i=0; isetEf(ef); + time_begin = std::chrono::steady_clock::now(); + unsigned int correct = 0, total = 0; + #pragma omp parallel for reduction(+:total,correct) + for(unsigned int i = 0; i < testSize; i++){ + total += k; + std::priority_queue> result = appr_alg->searchKnn(testSet + vecdim * i, k); + while(result.size()){ + if(result.top().second == testCls[i]) correct++; + result.pop(); + } + } + printf("ef=%d, recall=%f, time=%.3fms\n", ef, double(correct)/total, 0.001*std::chrono::duration_cast(std::chrono::steady_clock::now() - time_begin).count()); + } + printf("Actual memory usage: %dMb\n", getCurrentRSS()); + + delete[] trainCls; + delete[] trainSet; + delete[] testCls; + delete[] testSet; + return 1; +} From 72b6e8f375916492898f023310e07ce6cd213e1f Mon Sep 17 00:00:00 2001 From: intstellar <51994487+intstellar@users.noreply.github.com> Date: Thu, 1 Apr 2021 12:08:18 +0800 Subject: [PATCH 2/2] Delete settings.json --- .vscode/settings.json | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 5e09a03c..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "files.associations": { - "chrono": "cpp" - } -} \ No newline at end of file