-
Notifications
You must be signed in to change notification settings - Fork 522
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
parallelizing L2 cache lookup (#3032)
Summary: Pull Request resolved: #3032 X-link: facebookresearch/FBGEMM#130 Change sets 1. instead of allocating an intermediate tensor to collect the L2 cache miss info, we do all the embedding copies inside the originally provided tensor and mark the related indices as -1 2. parallelize the cache lookup logic using multiple cachelib pools, which helps reduce LRU contention 3. fix the cachelib->UVA tensor data copy bug (wrong offset) Reviewed By: ehsanardestani Differential Revision: D61417947
- Loading branch information
1 parent
a9a3713
commit 4237e3a
Showing
17 changed files
with
437 additions
and
263 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
142 changes: 142 additions & 0 deletions
142
fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/cachelib_cache.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under the BSD-style license found in the | ||
* LICENSE file in the root directory of this source tree. | ||
*/ | ||
|
||
#pragma once | ||
#include <ATen/ATen.h> | ||
#include <cachelib/allocator/CacheAllocator.h> | ||
#include <cachelib/facebook/admin/CacheAdmin.h> | ||
|
||
#include <cstdint> | ||
#include <iostream> | ||
#include "fbgemm_gpu/split_embeddings_cache/kv_db_cpp_utils.h" | ||
|
||
namespace l2_cache { | ||
|
||
/// @ingroup embedding-ssd | ||
/// | ||
/// @brief A Cachelib wrapper class for Cachlib interaction | ||
/// | ||
/// It is for maintaining all the cache related operations, including | ||
/// initialization, insertion, lookup and eviction. | ||
/// It is stateful for eviction logic that caller has to specifically | ||
/// fetch and reset eviction related states. | ||
/// Cachelib related optimization will be captured inside this class | ||
/// e.g. fetch and delayed markUseful to boost up get performance | ||
/// | ||
/// @note that this class only handles single Cachelib read/update. | ||
/// parallelism is done on the caller side | ||
class CacheLibCache { | ||
public: | ||
using Cache = facebook::cachelib::LruAllocator; | ||
struct CacheConfig { | ||
size_t cacheSizeBytes; | ||
}; | ||
|
||
explicit CacheLibCache(size_t cacheSizeBytes, int64_t num_shards) | ||
: cacheConfig_(CacheConfig{.cacheSizeBytes = cacheSizeBytes}), | ||
cache_(initializeCacheLib(cacheConfig_)), | ||
admin_(createCacheAdmin(*cache_)) { | ||
for (int i = 0; i < num_shards; i++) { | ||
pool_ids_.push_back(cache_->addPool( | ||
fmt::format("shard_{}", i), | ||
cache_->getCacheMemoryStats().ramCacheSize / num_shards)); | ||
} | ||
} | ||
|
||
std::unique_ptr<Cache> initializeCacheLib(const CacheConfig& config) { | ||
Cache::Config cacheLibConfig; | ||
cacheLibConfig.setCacheSize(static_cast<uint64_t>(config.cacheSizeBytes)) | ||
.setCacheName("TBEL2Cache") | ||
.setAccessConfig({25 /* bucket power */, 10 /* lock power */}) | ||
.setFullCoredump(false) | ||
.validate(); | ||
return std::make_unique<Cache>(cacheLibConfig); | ||
} | ||
|
||
std::unique_ptr<facebook::cachelib::CacheAdmin> createCacheAdmin( | ||
Cache& cache) { | ||
facebook::cachelib::CacheAdmin::Config adminConfig; | ||
adminConfig.oncall = "mvai"; | ||
return std::make_unique<facebook::cachelib::CacheAdmin>( | ||
cache, std::move(adminConfig)); | ||
} | ||
|
||
/// Find the stored embeddings from a given embedding indices, aka key | ||
/// | ||
/// @param key embedding index to look up | ||
/// | ||
/// @return an optional value, return none on cache misses, if cache hit | ||
/// return a pointer to the cachelib underlying storage of associated | ||
/// embeddings | ||
/// | ||
/// @note that this is not thread safe, caller needs to make sure the data is | ||
/// fully processed before doing cache insertion, otherwise the returned space | ||
/// might be overwritten if cache is full | ||
std::optional<void*> get(int64_t key) { | ||
auto key_str = folly::StringPiece( | ||
reinterpret_cast<const char*>(&key), sizeof(int64_t)); | ||
auto item = cache_->find(key_str); | ||
if (!item) { | ||
return std::nullopt; | ||
} | ||
return const_cast<void*>(item->getMemory()); | ||
} | ||
|
||
/// Cachelib wrapper specific hash function | ||
/// | ||
/// @param key embedding index to get hashed | ||
/// | ||
/// @return an hashed value ranges from [0, num_pools) | ||
size_t get_shard_id(int64_t key) { | ||
return kv_db_utils::hash_shard(key, pool_ids_.size()); | ||
} | ||
|
||
/// get pool id given an embedding index | ||
/// | ||
/// @param key embedding index to get pool id | ||
/// | ||
/// @return a pool id associated with the given key, this is to build a | ||
/// deterministic mapping from a embedding index to a specific pool id | ||
facebook::cachelib::PoolId get_pool_id(int64_t key) { | ||
return pool_ids_[get_shard_id(key)]; | ||
} | ||
|
||
/// Add an embedding index and embeddings into cachelib | ||
/// | ||
/// @param key embedding index to insert | ||
/// | ||
/// @return true on success insertion, false on failure insertion, a failure | ||
/// insertion could happen if the refcount of bottom K items in LRU queue | ||
/// isn't 0. | ||
|
||
/// @note In training use case, this is not expected to happen as we do | ||
/// bulk read and bluk write sequentially | ||
/// | ||
/// @note cache_->allocation will trigger eviction callback func | ||
bool put(int64_t key, const at::Tensor& data) { | ||
auto key_str = folly::StringPiece( | ||
reinterpret_cast<const char*>(&key), sizeof(int64_t)); | ||
auto item = cache_->allocate(get_pool_id(key), key_str, data.nbytes()); | ||
if (!item) { | ||
XLOG(ERR) << fmt::format( | ||
"Failed to allocate item {} in cache, skip", key); | ||
return false; | ||
} | ||
std::memcpy(item->getMemory(), data.data_ptr(), data.nbytes()); | ||
cache_->insertOrReplace(std::move(item)); | ||
return true; | ||
} | ||
|
||
private: | ||
const CacheConfig cacheConfig_; | ||
std::unique_ptr<Cache> cache_; | ||
std::vector<facebook::cachelib::PoolId> pool_ids_; | ||
std::unique_ptr<facebook::cachelib::CacheAdmin> admin_; | ||
}; | ||
|
||
} // namespace l2_cache |
32 changes: 32 additions & 0 deletions
32
fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/kv_db_cpp_utils.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under the BSD-style license found in the | ||
* LICENSE file in the root directory of this source tree. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <folly/hash/Hash.h> | ||
#include <stddef.h> | ||
#include <stdint.h> | ||
|
||
namespace kv_db_utils { | ||
|
||
/// @ingroup embedding-ssd | ||
/// | ||
/// @brief hash function used for SSD L2 cache and rocksdb sharding algorithm | ||
/// | ||
/// @param id sharding key | ||
/// @param num_shards sharding range | ||
/// | ||
/// @return shard id ranges from [0, num_shards) | ||
inline size_t hash_shard(int64_t id, size_t num_shards) { | ||
auto hash = folly::hash::fnv64_buf( | ||
reinterpret_cast<const char*>(&id), sizeof(int64_t)); | ||
__uint128_t wide = __uint128_t{num_shards} * hash; | ||
return static_cast<size_t>(wide >> 64); | ||
} | ||
|
||
}; // namespace kv_db_utils |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.